acs <- read.table("http://jaredlander.com/data/acs_ny.csv",sep=",",
header=TRUE, stringsAsFactors=TRUE)
library(ggplot2)
library(useful)
library(caret)
## Loading required package: lattice
library(ISLR)
library(scales)
library(plyr)
library(rpart)
library(rpart.plot)
library(class)
library(MuMIn)
library(caret)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RColorBrewer)
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:MuMIn':
##
## importance
## The following object is masked from 'package:ggplot2':
##
## margin
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:scales':
##
## alpha
## The following object is masked from 'package:ggplot2':
##
## alpha
library(nlme)
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
library(lme4)
## Loading required package: Matrix
##
## Attaching package: 'lme4'
## The following object is masked from 'package:nlme':
##
## lmList
library(mgcv)
## This is mgcv 1.8-11. For overview type 'help("mgcv-package")'.
Let’s assume our goal is to build a model to predict if household income is greater than $250,000 per year.
Start by building a binary response variable.
acs$HighIncome <- as.numeric(with(acs, FamilyIncome >= 250000))
head(acs)
## Acres FamilyIncome FamilyType NumBedrooms NumChildren NumPeople
## 1 1-10 150 Married 4 1 3
## 2 1-10 180 Female Head 3 2 4
## 3 1-10 280 Female Head 4 0 2
## 4 1-10 330 Female Head 2 1 2
## 5 1-10 330 Male Head 3 1 2
## 6 1-10 480 Male Head 0 3 4
## NumRooms NumUnits NumVehicles NumWorkers OwnRent YearBuilt
## 1 9 Single detached 1 0 Mortgage 1950-1959
## 2 6 Single detached 2 0 Rented Before 1939
## 3 8 Single detached 3 1 Mortgage 2000-2004
## 4 4 Single detached 1 0 Rented 1950-1959
## 5 5 Single attached 1 0 Mortgage Before 1939
## 6 1 Single detached 0 0 Rented Before 1939
## HouseCosts ElectricBill FoodStamp HeatingFuel Insurance Language
## 1 1800 90 No Gas 2500 English
## 2 850 90 No Oil 0 English
## 3 2600 260 No Oil 6600 Other European
## 4 1800 140 No Oil 0 English
## 5 860 150 No Gas 660 Spanish
## 6 700 140 No Gas 0 English
## HighIncome
## 1 0
## 2 0
## 3 0
## 4 0
## 5 0
## 6 0
tail(acs)
## Acres FamilyIncome FamilyType NumBedrooms NumChildren NumPeople
## 22740 10+ 556250 Married 4 0 2
## 22741 10+ 565000 Married 5 3 5
## 22742 10+ 599000 Married 4 0 2
## 22743 10+ 611700 Married 4 1 5
## 22744 10+ 621430 Married 3 2 4
## 22745 10+ 751200 Married 4 2 4
## NumRooms NumUnits NumVehicles NumWorkers OwnRent YearBuilt
## 22740 10 Single detached 2 1 Mortgage Before 1939
## 22741 10 Single detached 2 2 Mortgage 1990-1999
## 22742 6 Single detached 2 2 Mortgage Before 1939
## 22743 9 Single detached 5 3 Mortgage Before 1939
## 22744 11 Single detached 2 3 Mortgage 1970-1979
## 22745 10 Single detached 2 2 Mortgage 1940-1949
## HouseCosts ElectricBill FoodStamp HeatingFuel Insurance Language
## 22740 1800 380 No Oil 1700 English
## 22741 1700 370 No Gas 1000 English
## 22742 1300 100 No Gas 3500 English
## 22743 410 100 No Oil 1300 Spanish
## 22744 1600 80 No Gas 800 Spanish
## 22745 6500 400 No Oil 2000 English
## HighIncome
## 22740 1
## 22741 1
## 22742 1
## 22743 1
## 22744 1
## 22745 1
acs$foodstamp_binary <- ifelse(acs$FoodStamp == "Yes",1,0) # (yes = 1, no = 0)
acs$own_home <- ifelse(acs$OwnRent == "Rented",0, ifelse(acs$FamilyIncome == "Mortgage",1,2)) # (own = 1, rent = 0)
acs$family_type_cat <- ifelse(acs$FamilyType == "Married",1,
ifelse(acs$FamilyIncome == "Female Head",2,3)) # married = 1, male head = 2, female head = 3
acs$InsuranceHigh <- (acs$Insurance > 1000) * 1
acs$NumWorkers2 <- (acs$NumWorkers == 2) * 1
acs$HouseCostsHigh <- (acs$HouseCosts > 1000) * 1
acs$high_electric <- (acs$ElectricBill > 350) * 1
set.seed(447)
testrecs <- sample(nrow(acs),0.2 * nrow(acs))
acs_test <- acs[testrecs,]
acs_fit <- acs[-testrecs,]
acs$HI_pred1 <- 0
acs$HI_pred1[acs_test$FoodStamp == 'No' & acs_test$OwnRent != 'Rented' & acs_test$FamilyType == 'Married'] <- 1
# Nice visualization for a quick visual of what you're dealing with before digging in
ggplot(acs,aes(x=FamilyIncome)) + geom_density(fill="#31a354", color="#31a354") +
geom_vline(xintercept=250000) + scale_x_continuous(label=multiple.dollar, limits=c(0,1000000))
## Warning: Removed 13 rows containing non-finite values (stat_density).
Downside is it has a 5000 observation limit. If p-value is less than .05 (or chosen significance level) then sample is NOT normally distributed. This is not relevant here, but worth keeping for future. The plot shows that there is a clear left skewned distribution - expected but still nice to include
shapiro.test(acs_test$FamilyIncome)
##
## Shapiro-Wilk normality test
##
## data: acs_test$FamilyIncome
## W = 0.71538, p-value < 2.2e-16
Before trying to build any classification models, do some exploratory data analysis to try to get a sense of which variables might be useful for trying to predict cases where FamilyIncome >= 250000. You should use a combination of group by analysis (i.e. plyr or dplyr or similar) and plotting.
# Get some summary stats on each variable
summary(acs_fit)
## Acres FamilyIncome FamilyType NumBedrooms
## 10+ : 815 Min. : 50 Female Head: 2594 Min. :0.000
## 1-10 : 3709 1st Qu.: 53000 Male Head : 907 1st Qu.:3.000
## Sub 1:13672 Median : 87100 Married :14695 Median :3.000
## Mean : 110726 Mean :3.384
## 3rd Qu.: 134082 3rd Qu.:4.000
## Max. :1605000 Max. :8.000
##
## NumChildren NumPeople NumRooms NumUnits
## Min. : 0.0000 Min. : 2.0 Min. : 1.00 Mobile home : 583
## 1st Qu.: 0.0000 1st Qu.: 2.0 1st Qu.: 6.00 Single attached: 1916
## Median : 0.0000 Median : 3.0 Median : 7.00 Single detached:15697
## Mean : 0.9013 Mean : 3.4 Mean : 7.18
## 3rd Qu.: 2.0000 3rd Qu.: 4.0 3rd Qu.: 8.00
## Max. :12.0000 Max. :18.0 Max. :21.00
##
## NumVehicles NumWorkers OwnRent YearBuilt
## Min. :0.000 Min. :0.000 Mortgage:16074 Before 1939:4874
## 1st Qu.:2.000 1st Qu.:1.000 Outright: 128 1950-1959 :3107
## Median :2.000 Median :2.000 Rented : 1994 1960-1969 :2201
## Mean :2.119 Mean :1.748 1970-1979 :1894
## 3rd Qu.:3.000 3rd Qu.:2.000 1990-1999 :1647
## Max. :6.000 Max. :3.000 1980-1989 :1598
## (Other) :2875
## HouseCosts ElectricBill FoodStamp HeatingFuel
## Min. : 4 Min. : 1.0 No :16777 Gas :10899
## 1st Qu.: 650 1st Qu.:100.0 Yes: 1419 Oil : 5187
## Median :1200 Median :150.0 Wood : 986
## Mean :1480 Mean :174.4 Electricity: 831
## 3rd Qu.:2000 3rd Qu.:220.0 Other : 148
## Max. :7090 Max. :580.0 Coal : 124
## (Other) : 21
## Insurance Language HighIncome
## Min. : 0.0 Asian Pacific : 639 Min. :0.00000
## 1st Qu.: 400.0 English :14262 1st Qu.:0.00000
## Median : 720.0 Other : 225 Median :0.00000
## Mean : 958.9 Other European: 1640 Mean :0.06139
## 3rd Qu.:1200.0 Spanish : 1430 3rd Qu.:0.00000
## Max. :6600.0 Max. :1.00000
##
## foodstamp_binary own_home family_type_cat InsuranceHigh
## Min. :0.00000 Min. :0.000 Min. :1.000 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:2.000 1st Qu.:1.000 1st Qu.:0.000
## Median :0.00000 Median :2.000 Median :1.000 Median :0.000
## Mean :0.07798 Mean :1.781 Mean :1.385 Mean :0.326
## 3rd Qu.:0.00000 3rd Qu.:2.000 3rd Qu.:1.000 3rd Qu.:1.000
## Max. :1.00000 Max. :2.000 Max. :3.000 Max. :1.000
##
## NumWorkers2 HouseCostsHigh high_electric
## Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.0000 Median :1.0000 Median :0.00000
## Mean :0.4768 Mean :0.5433 Mean :0.06666
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000
##
# see that those that those that own home correlate with higher incomes overall
ggplot(acs_fit) + geom_histogram(aes(x=own_home), fill = "gray")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# scatter number of workers and family income
ggplot(data=acs_fit) + geom_point(aes(x=NumWorkers, y=FamilyIncome))
# scatter plot shows that those not on foodstamps tend to have higher income = duh, but relevant for model later
ggplot(data=acs_fit) + geom_point(aes(x=foodstamp_binary, y=FamilyIncome))
# plot shows that homes with 2 workers correlate with higher incomes vs other number of workers
ggplot(data=acs_fit) + geom_point(aes(x=NumWorkers, y=FamilyIncome))
# notice that there are very few observations with male head type. Female head has lower income and married highest incomes
ggplot(data=acs_fit) + geom_point(aes(x=family_type_cat, y=FamilyIncome))
# scatter house costs and family income - see that higher house costs correlate to higher incomes (slightly) - nothin major though
ggplot(data=acs_fit) + geom_point(aes(x=HouseCosts, y=FamilyIncome))
# create matrix of scatterplots
##pairs(acs[,1:19])
coor_cartesian -> Setting limits on the coordinate system will zoom the plot (like you’re looking at it with a magnifying glass), and will not change the underlying data like setting limits on a scale will.
# See that outliers begin roughly around income of $100,000
ggplot(data=acs_fit) + geom_boxplot(aes(x=NumWorkers, y=FamilyIncome)) + coord_cartesian(ylim = c(0, 350000))
## Warning: Continuous x aesthetic -- did you forget aes(group=...)?
These show the density by variable on axis. These are useful to see the concentration range of values
ggplot(acs_fit) + geom_density(aes(x=acs_fit$FamilyIncome)) + scale_x_continuous(labels=dollar)
ggplot(acs_fit) + geom_density(aes(x=acs_fit$HouseCosts)) + scale_x_continuous(labels=dollar)
ggplot(acs_fit) + geom_density(aes(x=acs_fit$NumChildren)) + scale_x_continuous()
ggplot(acs_fit) + geom_density(aes(x=acs_fit$FamilyIncome)) + scale_x_log10(breaks =c(100,1000,10000,100000), labels=dollar) + annotation_logticks(sides="bt")
ggplot(acs_fit) + geom_density(aes(x=acs_fit$HouseCosts)) + scale_x_log10(breaks =c(100,1000,10000,100000), labels=dollar) + annotation_logticks(sides="bt")
# shows positive correlation between insurance and family income
ggplot(acs_fit, aes(x=acs_fit$Insurance, y=acs_fit$FamilyIncome)) +geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam'
# density plot for electrical bill
ggplot(acs_fit) + geom_density(aes(x=acs_fit$ElectricBill)) + scale_x_log10(breaks =c(100,1000,10000,100000), labels=dollar) + annotation_logticks(sides="bt")
# shows positive correlation between electric bill and family income
ggplot(acs_fit, aes(x=acs_fit$ElectricBill, y=acs_fit$FamilyIncome)) +geom_point() + geom_smooth()
## `geom_smooth()` using method = 'gam'
# This shows a good spread or range of each family type group. This will lend itself well to being included in my analysis
ddply(acs_fit,.(FamilyType),summarise,family_type_count=length(FamilyIncome))
## FamilyType family_type_count
## 1 Female Head 2594
## 2 Male Head 907
## 3 Married 14695
# Interesting look at mean income of family type grouped with home ownership type
ddply(acs_fit,.(FamilyType,OwnRent), summarise, mean_income=mean(FamilyIncome))
## FamilyType OwnRent mean_income
## 1 Female Head Mortgage 69983.15
## 2 Female Head Outright 92303.57
## 3 Female Head Rented 34258.14
## 4 Male Head Mortgage 79298.16
## 5 Male Head Outright 213250.00
## 6 Male Head Rented 47620.92
## 7 Married Mortgage 126152.48
## 8 Married Outright 110020.36
## 9 Married Rented 71331.00
ddply(acs_fit,.(FamilyType,FoodStamp), summarise, mean_income=mean(FamilyIncome))
## FamilyType FoodStamp mean_income
## 1 Female Head No 67144.55
## 2 Female Head Yes 37105.47
## 3 Male Head No 78679.59
## 4 Male Head Yes 45825.75
## 5 Married No 124832.07
## 6 Married Yes 61612.91
# simple look at mean income by foodstamp. Obvious results, but at the same time surprising to find that mean income for those on food stamps in near $50k
ddply(acs_fit,.(FoodStamp), summarise, mean_income=mean(FamilyIncome))
## FoodStamp mean_income
## 1 No 115898.77
## 2 Yes 49572.72
ddply(acs_fit,.(FoodStamp,NumBedrooms), summarise, mean_income=mean(FamilyIncome), num_bedrooms=mean(NumBedrooms))
## FoodStamp NumBedrooms mean_income num_bedrooms
## 1 No 0 91092.92 0
## 2 No 1 68606.42 1
## 3 No 2 80216.78 2
## 4 No 3 102048.19 3
## 5 No 4 137687.35 4
## 6 No 5 173357.12 5
## 7 No 8 182832.27 8
## 8 Yes 0 36577.50 0
## 9 Yes 1 22038.06 1
## 10 Yes 2 33118.22 2
## 11 Yes 3 45066.36 3
## 12 Yes 4 62828.82 4
## 13 Yes 5 60499.15 5
## 14 Yes 8 82773.62 8
# This is a little excessive, but would be useful for piping it to a csv file (for example) if relevant to tasks at hand
##ddply(acs,.(NumBedrooms,NumChildren,NumPeople,NumRooms,NumUnits,NumVehicles,NumWorkers), summarise, mean_income=mean(FamilyIncome))
ddply(acs_fit,.(OwnRent), summarise, mean_income=mean(FamilyIncome))
## OwnRent mean_income
## 1 Mortgage 117551.80
## 2 Outright 109695.55
## 3 Rented 55771.63
# Family Type
tapply(acs_fit$FamilyIncome,acs_fit$FamilyType,length)
## Female Head Male Head Married
## 2594 907 14695
tapply(acs_fit$FamilyIncome,acs_fit$FamilyType,mean)
## Female Head Male Head Married
## 60242.74 72992.65 121966.88
# Own/Rent
tapply(acs_fit$FamilyIncome,acs_fit$OwnRent,length)
## Mortgage Outright Rented
## 16074 128 1994
tapply(acs_fit$FamilyIncome,acs_fit$OwnRent,mean)
## Mortgage Outright Rented
## 117551.80 109695.55 55771.63
# Insurance
tapply(acs_fit$FamilyIncome,acs_fit$FoodStamp,length)
## No Yes
## 16777 1419
tapply(acs_fit$FamilyIncome,acs_fit$FoodStamp,mean)
## No Yes
## 115898.77 49572.72
Start by building a null model in which you simply predict that everyone’s income is < 250000 (since the majority of incomes are less than 250000).
acs$null_model <- 0
table(acs$HighIncome, acs$null_model)
##
## 0
## 0 21356
## 1 1389
prop.table(table(acs$HighIncome, acs$null_model))
##
## 0
## 0 0.93893163
## 1 0.06106837
confusionMatrix(as.factor(acs$null_model), as.factor(acs$HighIncome), positive = "1")
## Warning in confusionMatrix.default(as.factor(acs$null_model), as.factor(acs
## $HighIncome), : Levels are not in the same order for reference and data.
## Refactoring data to match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 21356 1389
## 1 0 0
##
## Accuracy : 0.9389
## 95% CI : (0.9357, 0.942)
## No Information Rate : 0.9389
## P-Value [Acc > NIR] : 0.5071
##
## Kappa : 0
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.93893
## Prevalence : 0.06107
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : 1
##
logmod1 <- glm(HighIncome ~ FamilyType + NumVehicles + OwnRent + Insurance + YearBuilt, data=acs_fit,
family=binomial(link="logit"))
summary(logmod1)
##
## Call:
## glm(formula = HighIncome ~ FamilyType + NumVehicles + OwnRent +
## Insurance + YearBuilt, family = binomial(link = "logit"),
## data = acs_fit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.9844 -0.3540 -0.2875 -0.1979 3.3452
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.606e+01 1.539e+02 -0.104 0.917
## FamilyTypeMale Head 6.722e-01 3.116e-01 2.157 0.031 *
## FamilyTypeMarried 1.674e+00 2.035e-01 8.225 < 2e-16 ***
## NumVehicles 2.463e-01 3.374e-02 7.300 2.88e-13 ***
## OwnRentOutright 7.038e-01 3.182e-01 2.212 0.027 *
## OwnRentRented -1.159e-01 1.937e-01 -0.598 0.550
## Insurance 6.605e-04 2.212e-05 29.855 < 2e-16 ***
## YearBuilt1940-1949 9.780e+00 1.539e+02 0.064 0.949
## YearBuilt1950-1959 1.034e+01 1.539e+02 0.067 0.946
## YearBuilt1960-1969 1.036e+01 1.539e+02 0.067 0.946
## YearBuilt1970-1979 1.015e+01 1.539e+02 0.066 0.947
## YearBuilt1980-1989 1.049e+01 1.539e+02 0.068 0.946
## YearBuilt1990-1999 1.050e+01 1.539e+02 0.068 0.946
## YearBuilt2000-2004 1.061e+01 1.539e+02 0.069 0.945
## YearBuilt2005 1.071e+01 1.539e+02 0.070 0.945
## YearBuilt2006 1.105e+01 1.539e+02 0.072 0.943
## YearBuilt2007 1.074e+01 1.539e+02 0.070 0.944
## YearBuilt2008 1.080e+01 1.539e+02 0.070 0.944
## YearBuilt2009 1.046e+01 1.539e+02 0.068 0.946
## YearBuilt2010 1.135e+01 1.539e+02 0.074 0.941
## YearBuiltBefore 1939 1.033e+01 1.539e+02 0.067 0.946
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8398.1 on 18195 degrees of freedom
## Residual deviance: 7076.0 on 18175 degrees of freedom
## AIC: 7118
##
## Number of Fisher Scoring iterations: 12
acs_test$yhat_logmod1 <- predict(logmod1, newdata=acs_test, type='response')
acs_test$yhat_logmod1 <- (acs_test$yhat_logmod1 > 0.05) * 1
log_mod1_aic <- summary(logmod1)$aic
log_cm1 <- confusionMatrix(as.factor(acs_test$yhat_logmod1), as.factor(acs_test$HighIncome), positive = "1")
log_cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2662 57
## 1 1615 215
##
## Accuracy : 0.6324
## 95% CI : (0.6182, 0.6465)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1121
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.79044
## Specificity : 0.62240
## Pos Pred Value : 0.11749
## Neg Pred Value : 0.97904
## Prevalence : 0.05979
## Detection Rate : 0.04726
## Detection Prevalence : 0.40229
## Balanced Accuracy : 0.70642
##
## 'Positive' Class : 1
##
logmod2 <- glm(HighIncome ~ FamilyType + FoodStamp + OwnRent, data=acs_fit, family=binomial(link="logit"))
summary(logmod2)
##
## Call:
## glm(formula = HighIncome ~ FamilyType + FoodStamp + OwnRent,
## family = binomial(link = "logit"), data = acs_fit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4992 -0.4048 -0.4048 -0.1728 3.7799
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.1971 0.1947 -21.557 < 2e-16 ***
## FamilyTypeMale Head 0.6399 0.3026 2.114 0.0345 *
## FamilyTypeMarried 1.7363 0.1968 8.821 < 2e-16 ***
## FoodStampYes -1.9012 0.3578 -5.314 1.08e-07 ***
## OwnRentOutright 0.4413 0.2964 1.489 0.1365
## OwnRentRented -1.0447 0.1885 -5.541 3.00e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8398.1 on 18195 degrees of freedom
## Residual deviance: 8037.1 on 18190 degrees of freedom
## AIC: 8049.1
##
## Number of Fisher Scoring iterations: 7
acs_test$yhat_logmod2 <- predict(logmod2, newdata=acs_test, type='response')
acs_test$yhat_logmod2 <- (acs_test$yhat_logmod2 > 0.05) * 1
log_mod2_aic <- summary(logmod2)$aic
log_cm2 <- confusionMatrix(as.factor(acs_test$yhat_logmod2), as.factor(acs_test$HighIncome), positive = "1")
log_cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1265 28
## 1 3012 244
##
## Accuracy : 0.3317
## 95% CI : (0.318, 0.3456)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0314
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.89706
## Specificity : 0.29577
## Pos Pred Value : 0.07494
## Neg Pred Value : 0.97834
## Prevalence : 0.05979
## Detection Rate : 0.05364
## Detection Prevalence : 0.71576
## Balanced Accuracy : 0.59641
##
## 'Positive' Class : 1
##
logmod3 <- glm(HighIncome ~ InsuranceHigh + NumWorkers2 + HouseCostsHigh + FoodStamp + OwnRent,
data=acs_fit, family=binomial(link="logit"))
summary(logmod3)
##
## Call:
## glm(formula = HighIncome ~ InsuranceHigh + NumWorkers2 + HouseCostsHigh +
## FoodStamp + OwnRent, family = binomial(link = "logit"), data = acs_fit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2253 -0.3183 -0.2682 -0.1640 3.6467
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.30241 0.09797 -43.915 < 2e-16 ***
## InsuranceHigh 1.34537 0.07504 17.929 < 2e-16 ***
## NumWorkers2 0.07590 0.06409 1.184 0.2363
## HouseCostsHigh 1.26356 0.09694 13.035 < 2e-16 ***
## FoodStampYes -2.07741 0.35866 -5.792 6.95e-09 ***
## OwnRentOutright 1.72956 0.31359 5.515 3.48e-08 ***
## OwnRentRented -0.34415 0.19554 -1.760 0.0784 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8398.1 on 18195 degrees of freedom
## Residual deviance: 7305.5 on 18189 degrees of freedom
## AIC: 7319.5
##
## Number of Fisher Scoring iterations: 7
acs_test$yhat_logmod3 <- predict(logmod3, newdata=acs_test, type='response')
acs_test$yhat_logmod3 <- (acs_test$yhat_logmod3 > 0.05) * 1
log_mod3_aic <- summary(logmod3)$aic
log_cm3 <- confusionMatrix(as.factor(acs_test$yhat_logmod3), as.factor(acs_test$HighIncome), positive = "1")
log_cm3
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3090 72
## 1 1187 200
##
## Accuracy : 0.7232
## 95% CI : (0.71, 0.7362)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1568
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.73529
## Specificity : 0.72247
## Pos Pred Value : 0.14420
## Neg Pred Value : 0.97723
## Prevalence : 0.05979
## Detection Rate : 0.04397
## Detection Prevalence : 0.30490
## Balanced Accuracy : 0.72888
##
## 'Positive' Class : 1
##
logmod4 <- glm(HighIncome ~ InsuranceHigh + NumWorkers2 + HouseCostsHigh, data=acs_fit, family=binomial(link="logit"))
summary(logmod4)
##
## Call:
## glm(formula = HighIncome ~ InsuranceHigh + NumWorkers2 + HouseCostsHigh,
## family = binomial(link = "logit"), data = acs_fit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5958 -0.3163 -0.2879 -0.1577 2.9642
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.38071 0.09202 -47.604 <2e-16 ***
## InsuranceHigh 1.41041 0.07252 19.447 <2e-16 ***
## NumWorkers2 0.11334 0.06374 1.778 0.0753 .
## HouseCostsHigh 1.21827 0.09407 12.950 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8398.1 on 18195 degrees of freedom
## Residual deviance: 7404.8 on 18192 degrees of freedom
## AIC: 7412.8
##
## Number of Fisher Scoring iterations: 6
acs_test$yhat_logmod4 <- predict(logmod4, newdata=acs_test, type='response')
acs_test$yhat_logmod4 <- (acs_test$yhat_logmod4 > 0.05) * 1
log_mod4_aic <- summary(logmod4)$aic
log_cm4 <- confusionMatrix(as.factor(acs_test$yhat_logmod4), as.factor(acs_test$HighIncome), positive = "1")
log_cm4
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3072 73
## 1 1205 199
##
## Accuracy : 0.7191
## 95% CI : (0.7058, 0.7321)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1526
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.73162
## Specificity : 0.71826
## Pos Pred Value : 0.14174
## Neg Pred Value : 0.97679
## Prevalence : 0.05979
## Detection Rate : 0.04375
## Detection Prevalence : 0.30864
## Balanced Accuracy : 0.72494
##
## 'Positive' Class : 1
##
logmod5 <- glm(HighIncome ~ FamilyType + NumBedrooms + NumChildren + NumPeople + NumRooms + NumUnits + NumVehicles +
NumWorkers + OwnRent + HouseCosts + ElectricBill + FoodStamp + Insurance + Language +
InsuranceHigh + NumWorkers2 + HouseCostsHigh, data=acs_fit, family=binomial(link="logit"))
summary(logmod5)
##
## Call:
## glm(formula = HighIncome ~ FamilyType + NumBedrooms + NumChildren +
## NumPeople + NumRooms + NumUnits + NumVehicles + NumWorkers +
## OwnRent + HouseCosts + ElectricBill + FoodStamp + Insurance +
## Language + InsuranceHigh + NumWorkers2 + HouseCostsHigh,
## family = binomial(link = "logit"), data = acs_fit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5995 -0.3188 -0.2038 -0.1272 3.6788
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.020e+01 1.550e+02 -0.130 0.896356
## FamilyTypeMale Head 5.779e-01 3.213e-01 1.798 0.072102 .
## FamilyTypeMarried 1.436e+00 2.069e-01 6.937 4.00e-12 ***
## NumBedrooms 1.191e-01 3.507e-02 3.396 0.000685 ***
## NumChildren 1.853e-01 5.317e-02 3.486 0.000491 ***
## NumPeople -2.690e-01 5.039e-02 -5.338 9.39e-08 ***
## NumRooms 1.064e-01 1.426e-02 7.461 8.61e-14 ***
## NumUnitsSingle attached 1.320e+01 1.550e+02 0.085 0.932166
## NumUnitsSingle detached 1.284e+01 1.550e+02 0.083 0.933970
## NumVehicles 2.531e-01 4.244e-02 5.963 2.48e-09 ***
## NumWorkers 1.778e-01 5.742e-02 3.096 0.001964 **
## OwnRentOutright 1.930e+00 3.441e-01 5.609 2.04e-08 ***
## OwnRentRented 3.982e-01 2.026e-01 1.965 0.049373 *
## HouseCosts 4.985e-04 2.945e-05 16.926 < 2e-16 ***
## ElectricBill 2.001e-03 2.980e-04 6.714 1.89e-11 ***
## FoodStampYes -1.255e+00 3.727e-01 -3.367 0.000760 ***
## Insurance 2.365e-04 3.125e-05 7.568 3.79e-14 ***
## LanguageEnglish -2.851e-01 1.629e-01 -1.750 0.080110 .
## LanguageOther -1.487e-01 2.899e-01 -0.513 0.607872
## LanguageOther European -1.955e-01 1.839e-01 -1.063 0.287918
## LanguageSpanish -6.078e-01 2.096e-01 -2.899 0.003739 **
## InsuranceHigh 4.681e-01 9.381e-02 4.990 6.05e-07 ***
## NumWorkers2 3.137e-02 7.781e-02 0.403 0.686814
## HouseCostsHigh 1.797e-01 1.155e-01 1.555 0.119832
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8398.1 on 18195 degrees of freedom
## Residual deviance: 6140.7 on 18172 degrees of freedom
## AIC: 6188.7
##
## Number of Fisher Scoring iterations: 16
acs_test$yhat_logmod5 <- predict(logmod5, newdata=acs_test, type='response')
acs_test$yhat_logmod5 <- (acs_test$yhat_logmod5 > 0.05) * 1
log_mod5_aic <- summary(logmod5)$aic
log_cm5 <- confusionMatrix(as.factor(acs_test$yhat_logmod5), as.factor(acs_test$HighIncome), positive = "1")
log_cm5
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3145 50
## 1 1132 222
##
## Accuracy : 0.7402
## 95% CI : (0.7272, 0.7529)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1927
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.81618
## Specificity : 0.73533
## Pos Pred Value : 0.16396
## Neg Pred Value : 0.98435
## Prevalence : 0.05979
## Detection Rate : 0.04880
## Detection Prevalence : 0.29765
## Balanced Accuracy : 0.77575
##
## 'Positive' Class : 1
##
logmod6 <- glm(HighIncome ~ FamilyType + NumBedrooms + NumChildren + OwnRent +
HouseCosts + ElectricBill + FoodStamp + InsuranceHigh,
data=acs_fit, family=binomial(link="logit"))
summary(logmod6)
##
## Call:
## glm(formula = HighIncome ~ FamilyType + NumBedrooms + NumChildren +
## OwnRent + HouseCosts + ElectricBill + FoodStamp + InsuranceHigh,
## family = binomial(link = "logit"), data = acs_fit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1642 -0.3266 -0.2067 -0.1488 3.8673
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.125e+00 2.339e-01 -30.467 < 2e-16 ***
## FamilyTypeMale Head 5.904e-01 3.158e-01 1.869 0.0616 .
## FamilyTypeMarried 1.624e+00 2.028e-01 8.006 1.19e-15 ***
## NumBedrooms 2.568e-01 2.700e-02 9.514 < 2e-16 ***
## NumChildren -7.037e-02 2.962e-02 -2.376 0.0175 *
## OwnRentOutright 1.963e+00 3.122e-01 6.287 3.24e-10 ***
## OwnRentRented 5.207e-02 1.983e-01 0.263 0.7929
## HouseCosts 5.746e-04 2.467e-05 23.294 < 2e-16 ***
## ElectricBill 2.358e-03 2.826e-04 8.341 < 2e-16 ***
## FoodStampYes -1.764e+00 3.640e-01 -4.846 1.26e-06 ***
## InsuranceHigh 7.961e-01 8.077e-02 9.856 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8398.1 on 18195 degrees of freedom
## Residual deviance: 6352.8 on 18185 degrees of freedom
## AIC: 6374.8
##
## Number of Fisher Scoring iterations: 8
acs_test$yhat_logmod6 <- predict(logmod6, newdata=acs_test, type='response')
acs_test$yhat_logmod6 <- (acs_test$yhat_logmod6 > 0.05) * 1
log_mod6_aic <- summary(logmod6)$aic
log_cm6 <- confusionMatrix(as.factor(acs_test$yhat_logmod6), as.factor(acs_test$HighIncome), positive = "1")
log_cm6
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3090 55
## 1 1187 217
##
## Accuracy : 0.727
## 95% CI : (0.7138, 0.7399)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1764
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.79779
## Specificity : 0.72247
## Pos Pred Value : 0.15456
## Neg Pred Value : 0.98251
## Prevalence : 0.05979
## Detection Rate : 0.04770
## Detection Prevalence : 0.30864
## Balanced Accuracy : 0.76013
##
## 'Positive' Class : 1
##
linear_mod1 <- lm(FamilyIncome ~ FamilyType + FoodStamp + OwnRent + HouseCosts + Insurance + ElectricBill +
NumRooms, data=acs_fit)
summary(linear_mod1)
##
## Call:
## lm(formula = FamilyIncome ~ FamilyType + FoodStamp + OwnRent +
## HouseCosts + Insurance + ElectricBill + NumRooms, data = acs_fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -372890 -39833 -9227 22628 1198390
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.550e+04 2.699e+03 -9.448 < 2e-16 ***
## FamilyTypeMale Head 8.939e+03 3.233e+03 2.765 0.0057 **
## FamilyTypeMarried 3.814e+04 1.866e+03 20.435 < 2e-16 ***
## FoodStampYes -2.614e+04 2.485e+03 -10.519 < 2e-16 ***
## OwnRentOutright 4.074e+04 7.468e+03 5.455 4.95e-08 ***
## OwnRentRented -1.391e+03 2.241e+03 -0.620 0.5350
## HouseCosts 2.738e+01 6.451e-01 42.439 < 2e-16 ***
## Insurance 1.839e+01 7.637e-01 24.078 < 2e-16 ***
## ElectricBill 5.154e+01 6.313e+00 8.163 3.48e-16 ***
## NumRooms 5.537e+03 2.793e+02 19.823 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 83680 on 18186 degrees of freedom
## Multiple R-squared: 0.3138, Adjusted R-squared: 0.3135
## F-statistic: 924.1 on 9 and 18186 DF, p-value: < 2.2e-16
acs_test$lin_mod1_FamilyIncome <- predict(linear_mod1, newdata=acs_test)
acs_test$lin_mod1_HighIncome <- ifelse(acs_test$lin_mod1_FamilyIncome > 250000,1,0)
linear_mod1_rsq <- summary(linear_mod1)$r.sq
linear_mod1_aic <- AIC(linear_mod1)
linear_cm1 <- confusionMatrix(as.factor(acs_test$lin_mod1_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
linear_cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4230 206
## 1 47 66
##
## Accuracy : 0.9444
## 95% CI : (0.9373, 0.9509)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.123
##
## Kappa : 0.319
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.24265
## Specificity : 0.98901
## Pos Pred Value : 0.58407
## Neg Pred Value : 0.95356
## Prevalence : 0.05979
## Detection Rate : 0.01451
## Detection Prevalence : 0.02484
## Balanced Accuracy : 0.61583
##
## 'Positive' Class : 1
##
# Residual Analysis
summary(acs_test$HighIncome - predict(linear_mod1,newdata=acs_test))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -399700 -138300 -101600 -110700 -76090 28880
linear_mod2 <- lm(FamilyIncome ~ Insurance + HouseCosts + ElectricBill + NumWorkers + FamilyType +
FoodStamp + OwnRent + NumBedrooms + NumChildren + NumRooms + NumPeople +
NumVehicles + Language, data=acs_fit)
summary(linear_mod2)
##
## Call:
## lm(formula = FamilyIncome ~ Insurance + HouseCosts + ElectricBill +
## NumWorkers + FamilyType + FoodStamp + OwnRent + NumBedrooms +
## NumChildren + NumRooms + NumPeople + NumVehicles + Language,
## data = acs_fit)
##
## Residuals:
## Min 1Q Median 3Q Max
## -386370 -37621 -9512 20578 1182262
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.907e+04 4.482e+03 -10.947 < 2e-16 ***
## Insurance 1.891e+01 7.552e-01 25.047 < 2e-16 ***
## HouseCosts 2.791e+01 6.532e-01 42.723 < 2e-16 ***
## ElectricBill 4.849e+01 6.330e+00 7.659 1.96e-14 ***
## NumWorkers 1.865e+04 8.823e+02 21.136 < 2e-16 ***
## FamilyTypeMale Head 7.558e+03 3.179e+03 2.378 0.017433 *
## FamilyTypeMarried 3.036e+04 1.875e+03 16.191 < 2e-16 ***
## FoodStampYes -1.214e+04 2.548e+03 -4.766 1.89e-06 ***
## OwnRentOutright 5.753e+04 7.362e+03 7.814 5.83e-15 ***
## OwnRentRented 7.785e+03 2.241e+03 3.474 0.000513 ***
## NumBedrooms 2.337e+03 7.579e+02 3.083 0.002050 **
## NumChildren 3.963e+03 7.790e+02 5.087 3.67e-07 ***
## NumRooms 4.482e+03 3.381e+02 13.254 < 2e-16 ***
## NumPeople -7.638e+03 7.218e+02 -10.582 < 2e-16 ***
## NumVehicles 7.058e+03 7.393e+02 9.547 < 2e-16 ***
## LanguageEnglish 3.073e+03 3.400e+03 0.904 0.366076
## LanguageOther -1.447e+03 6.383e+03 -0.227 0.820722
## LanguageOther European -1.789e+03 3.850e+03 -0.465 0.642238
## LanguageSpanish -9.829e+03 3.931e+03 -2.500 0.012415 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 82010 on 18177 degrees of freedom
## Multiple R-squared: 0.3413, Adjusted R-squared: 0.3407
## F-statistic: 523.3 on 18 and 18177 DF, p-value: < 2.2e-16
acs_test$lin_mod2_FamilyIncome <- predict(linear_mod2, newdata=acs_test)
acs_test$lin_mod2_HighIncome <- ifelse(acs_test$lin_mod2_FamilyIncome > 250000,1,0)
linear_mod2_rsq <- summary(linear_mod2)$r.sq
linear_mod2_aic <- AIC(linear_mod2)
linear_cm2 <- confusionMatrix(as.factor(acs_test$lin_mod2_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
linear_cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4227 207
## 1 50 65
##
## Accuracy : 0.9435
## 95% CI : (0.9364, 0.95)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.1827
##
## Kappa : 0.3114
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.23897
## Specificity : 0.98831
## Pos Pred Value : 0.56522
## Neg Pred Value : 0.95332
## Prevalence : 0.05979
## Detection Rate : 0.01429
## Detection Prevalence : 0.02528
## Balanced Accuracy : 0.61364
##
## 'Positive' Class : 1
##
# Residual Analysis
summary(acs_test$HighIncome - predict(linear_mod2,newdata=acs_test))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -437000 -139600 -103600 -110700 -74330 44130
“Finds the line/plane/hyperplane that separates the two groups of data as much as possible, and then see which side the new data points land on.” click the link below to see rest of SVM explanation. They give two great examples in simple terms in this forum:
svm_mod1 <- ksvm(HighIncome ~ Insurance + HouseCosts + ElectricBill + FamilyType + OwnRent + NumVehicles + NumBedrooms +
NumWorkers + NumPeople + NumChildren + NumUnits + FoodStamp + YearBuilt + Language + HeatingFuel,
data=acs_fit, family=binomial(link="logit"))
acs_test$svm_HighIncome <- predict(svm_mod1, newdata=acs_test, type='response')
acs_test$svm_HighIncome <- (acs_test$svm_HighIncome > 0.5) * 1
svm_cm1 <- confusionMatrix(as.factor(acs_test$svm_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
svm_cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4259 231
## 1 18 41
##
## Accuracy : 0.9453
## 95% CI : (0.9383, 0.9517)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.07832
##
## Kappa : 0.2313
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.150735
## Specificity : 0.995791
## Pos Pred Value : 0.694915
## Neg Pred Value : 0.948552
## Prevalence : 0.059793
## Detection Rate : 0.009013
## Detection Prevalence : 0.012970
## Balanced Accuracy : 0.573263
##
## 'Positive' Class : 1
##
# Residual Analysis
summary(acs_test$HighIncome - predict(svm_mod1,newdata=acs_test))
## V1
## Min. :-1.091970
## 1st Qu.:-0.020412
## Median :-0.015125
## Mean : 0.030201
## 3rd Qu.:-0.007717
## Max. : 1.017718
svm_mod2 <- ksvm(FamilyIncome ~ Insurance + HouseCosts + ElectricBill + FamilyType + OwnRent + NumVehicles + NumBedrooms,
data=acs_fit)
acs_test$svm2_HighIncome <- predict(svm_mod2, newdata=acs_test)
acs_test$svm2_HighIncome <- ifelse(acs_test$svm2_HighIncome > 250000,1,0)
svm_cm2 <- confusionMatrix(as.factor(acs_test$svm2_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
svm_cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4252 223
## 1 25 49
##
## Accuracy : 0.9455
## 95% CI : (0.9385, 0.9519)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.06934
##
## Kappa : 0.2644
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.18015
## Specificity : 0.99415
## Pos Pred Value : 0.66216
## Neg Pred Value : 0.95017
## Prevalence : 0.05979
## Detection Rate : 0.01077
## Detection Prevalence : 0.01627
## Balanced Accuracy : 0.58715
##
## 'Positive' Class : 1
##
# Residual Analysis
summary(acs_test$HighIncome - predict(svm_mod2,newdata=acs_test))
## V1
## Min. :-452624
## 1st Qu.:-119155
## Median : -88490
## Mean : -97007
## 3rd Qu.: -66964
## Max. : -7565
svm_mod3 <- ksvm(HighIncome ~ Insurance + HouseCosts + ElectricBill + FamilyType + OwnRent + NumRooms, data=acs_fit,
family=binomial(link="logit"))
acs_test$svm3_HighIncome <- predict(svm_mod3, newdata=acs_test, type='response')
acs_test$svm3_HighIncome <- (acs_test$svm3_HighIncome > 0.5) * 1
svm_cm3 <- confusionMatrix(as.factor(acs_test$svm3_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
svm_cm3
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4258 224
## 1 19 48
##
## Accuracy : 0.9466
## 95% CI : (0.9396, 0.9529)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.03567
##
## Kappa : 0.2658
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.17647
## Specificity : 0.99556
## Pos Pred Value : 0.71642
## Neg Pred Value : 0.95002
## Prevalence : 0.05979
## Detection Rate : 0.01055
## Detection Prevalence : 0.01473
## Balanced Accuracy : 0.58601
##
## 'Positive' Class : 1
##
# Residual Analysis
summary(acs_test$HighIncome - predict(svm_mod3,newdata=acs_test))
## V1
## Min. :-1.01270
## 1st Qu.:-0.02385
## Median :-0.02340
## Mean : 0.02265
## 3rd Qu.:-0.02083
## Max. : 1.02540
“A mixed model is similar in many ways to a linear model. It estimates the effects of one or more explanatory variables on a response variable. The output of a mixed model will give you a list of explanatory values, estimates and confidence intervals of their effect sizes, p-values for each effect, and at least one measure of how well the model fits. You should use a mixed model instead of a simple linear model when you have a variable that describes your data sample as a subset of the data you could have collected” Tufts.edu (see link for source of quote)
Check out this link for a very well explained tutorial with easy to understand explanation of mixed models:
http://ase.tufts.edu/gsc/gradresources/guidetomixedmodelsinr/mixed%20model%20guide.html
I would not use this model for this data set to specify th
mms1 <- lme(HighIncome ~ Insurance + HouseCosts + ElectricBill + FoodStamp +
OwnRent, method = "ML", data = acs_fit, random =~ NumChildren | FamilyType)
summary(mms1)
## Linear mixed-effects model fit by maximum likelihood
## Data: acs_fit
## AIC BIC logLik
## -3398.435 -3312.537 1710.218
##
## Random effects:
## Formula: ~NumChildren | FamilyType
## Structure: General positive-definite, Log-Cholesky parametrization
## StdDev Corr
## (Intercept) 0.018021290 (Intr)
## NumChildren 0.002686936 -0.984
## Residual 0.220210011
##
## Fixed effects: HighIncome ~ Insurance + HouseCosts + ElectricBill + FoodStamp + OwnRent
## Value Std.Error DF t-value p-value
## (Intercept) -0.09421877 0.007771971 18187 -12.122892 0.0000
## Insurance 0.00004477 0.000002001 18187 22.375620 0.0000
## HouseCosts 0.00005145 0.000001701 18187 30.237485 0.0000
## ElectricBill 0.00012310 0.000016543 18187 7.441276 0.0000
## FoodStampYes -0.01421120 0.006544939 18187 -2.171327 0.0299
## OwnRentOutright 0.11008631 0.019637856 18187 5.605821 0.0000
## OwnRentRented 0.03884131 0.005858341 18187 6.630086 0.0000
## Correlation:
## (Intr) Insrnc HsCsts ElctrB FdStmY OwnRnO
## Insurance -0.031
## HouseCosts -0.223 -0.419
## ElectricBill -0.294 -0.173 -0.221
## FoodStampYes -0.128 -0.006 0.081 -0.036
## OwnRentOutright -0.013 -0.011 0.081 -0.014 0.001
## OwnRentRented -0.174 0.290 -0.047 -0.005 -0.245 0.027
##
## Standardized Within-Group Residuals:
## Min Q1 Med Q3 Max
## -2.79009543 -0.39900734 -0.11855030 0.05345452 4.86099519
##
## Number of Observations: 18196
## Number of Groups: 3
acs_test$mms1_HighIncome <- predict(mms1, newdata=acs_test, type='response')
acs_test$mms1_HighIncome <- (acs_test$mms1_HighIncome > 0.5) * 1
mms1_cm <- confusionMatrix(as.factor(acs_test$mms1_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
mms1_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4270 257
## 1 7 15
##
## Accuracy : 0.942
## 95% CI : (0.9348, 0.9486)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.3221
##
## Kappa : 0.0939
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.055147
## Specificity : 0.998363
## Pos Pred Value : 0.681818
## Neg Pred Value : 0.943230
## Prevalence : 0.059793
## Detection Rate : 0.003297
## Detection Prevalence : 0.004836
## Balanced Accuracy : 0.526755
##
## 'Positive' Class : 1
##
mms1_aic <- AIC(mms1)
mms1_aic
## [1] -3398.435
mms2 <- lmer(HighIncome ~ 1 + FoodStamp + OwnRent + FamilyType + OwnRent + NumWorkers + (1 | FamilyType), data = acs_fit)
summary(mms2)
## Linear mixed model fit by REML ['lmerMod']
## Formula: HighIncome ~ 1 + FoodStamp + OwnRent + FamilyType + OwnRent +
## NumWorkers + (1 | FamilyType)
## Data: acs_fit
##
## REML criterion at convergence: -480.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -0.5198 -0.3236 -0.2944 -0.1493 4.3136
##
## Random effects:
## Groups Name Variance Std.Dev.
## FamilyType (Intercept) 0.0008017 0.02831
## Residual 0.0568581 0.23845
## Number of obs: 18196, groups: FamilyType, 3
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 0.015036 0.028962 0.519
## FoodStampYes -0.030157 0.007083 -4.258
## OwnRentOutright 0.039813 0.021262 1.873
## OwnRentRented -0.027411 0.006054 -4.528
## FamilyTypeMale Head 0.006616 0.041088 0.161
## FamilyTypeMarried 0.048180 0.040400 1.193
## NumWorkers 0.006973 0.002218 3.144
##
## Correlation of Fixed Effects:
## (Intr) FdStmY OwnRnO OwnRnR FmlTMH FmlyTM
## FoodStampYs -0.050
## OwnRntOtrgh -0.017 0.005
## OwnRentRntd -0.052 -0.253 0.034
## FmlyTypMlHd -0.693 0.007 0.002 0.007
## FmlyTypMrrd -0.704 0.023 -0.002 0.020 0.497
## NumWorkers -0.115 0.082 0.097 0.076 -0.002 -0.020
acs_test$mms2_HighIncome <- predict(mms2, newdata=acs_test, type='response')
acs_test$mms2_HighIncome <- (acs_test$mms2_HighIncome > 0.5) * 1
mms2_cm <- confusionMatrix(as.factor(acs_test$mms2_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
## Warning in confusionMatrix.default(as.factor(acs_test$mms2_HighIncome), :
## Levels are not in the same order for reference and data. Refactoring data
## to match.
mms2_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4277 272
## 1 0 0
##
## Accuracy : 0.9402
## 95% CI : (0.9329, 0.9469)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.5161
##
## Kappa : 0
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.00000
## Specificity : 1.00000
## Pos Pred Value : NaN
## Neg Pred Value : 0.94021
## Prevalence : 0.05979
## Detection Rate : 0.00000
## Detection Prevalence : 0.00000
## Balanced Accuracy : 0.50000
##
## 'Positive' Class : 1
##
mms2_aic <- AIC(mms2)
mms2_aic
## [1] -462.3482
GAM’s let you represent nonlinear and non-montonic relationships between variables and outcome in linear or logistic regression framework
Evaluate the GAM with same measures as you would for simple linear or logistic regression.
Here is a good link with detailed explnation & a good code example:
https://stat.ethz.ch/R-manual/R-devel/library/mgcv/html/gam.html
gam_mod1 <- gam(FamilyIncome ~ s(Insurance) + s(HouseCosts) + s(ElectricBill) + NumWorkers + FamilyType +
FoodStamp + OwnRent + NumBedrooms + s(NumChildren) + s(NumRooms) + s(NumPeople) + NumVehicles +
Language, family=gaussian(link =identity), data=acs_fit)
summary(gam_mod1)
##
## Family: gaussian
## Link function: identity
##
## Formula:
## FamilyIncome ~ s(Insurance) + s(HouseCosts) + s(ElectricBill) +
## NumWorkers + FamilyType + FoodStamp + OwnRent + NumBedrooms +
## s(NumChildren) + s(NumRooms) + s(NumPeople) + NumVehicles +
## Language
##
## Parametric coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 34331.7 4910.4 6.992 2.81e-12 ***
## NumWorkers 19733.6 896.4 22.014 < 2e-16 ***
## FamilyTypeMale Head 7225.0 3155.6 2.290 0.0221 *
## FamilyTypeMarried 29536.5 1869.6 15.798 < 2e-16 ***
## FoodStampYes -12728.7 2538.3 -5.015 5.36e-07 ***
## OwnRentOutright 48514.6 7399.4 6.557 5.65e-11 ***
## OwnRentRented -6545.7 3289.9 -1.990 0.0466 *
## NumBedrooms 1270.0 776.0 1.636 0.1018
## NumVehicles 6865.2 740.6 9.270 < 2e-16 ***
## LanguageEnglish 1527.9 3389.4 0.451 0.6521
## LanguageOther -1459.9 6347.0 -0.230 0.8181
## LanguageOther European -2123.9 3829.2 -0.555 0.5791
## LanguageSpanish -9877.2 3906.8 -2.528 0.0115 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df F p-value
## s(Insurance) 7.729 8.458 74.504 < 2e-16 ***
## s(HouseCosts) 7.853 8.515 220.272 < 2e-16 ***
## s(ElectricBill) 1.000 1.000 59.394 1.36e-14 ***
## s(NumChildren) 3.825 4.639 9.296 4.13e-08 ***
## s(NumRooms) 4.031 4.910 43.928 < 2e-16 ***
## s(NumPeople) 2.342 2.974 34.244 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.351 Deviance explained = 35.2%
## GCV = 6.635e+09 Scale est. = 6.6205e+09 n = 18196
acs_test$gam_FamilyIncome <- predict(gam_mod1, newdata=acs_test)
acs_test$gam_HighIncome <- ifelse(acs_test$gam_FamilyIncome > 250000,1,0)
gam_mod1_rsq <- summary(gam_mod1)$r.sq
gam_mod_aic <- AIC(gam_mod1)
gam_cm1 <- confusionMatrix(as.factor(acs_test$gam_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
gam_cm1
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4209 206
## 1 68 66
##
## Accuracy : 0.9398
## 95% CI : (0.9325, 0.9465)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.5656
##
## Kappa : 0.2974
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.24265
## Specificity : 0.98410
## Pos Pred Value : 0.49254
## Neg Pred Value : 0.95334
## Prevalence : 0.05979
## Detection Rate : 0.01451
## Detection Prevalence : 0.02946
## Balanced Accuracy : 0.61337
##
## 'Positive' Class : 1
##
# Residual Analysis
summary(acs_test$HighIncome - predict(gam_mod1,newdata=acs_test))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -452200 -135800 -103300 -110700 -74820 50060
gam_mod2 <- gam(HighIncome ~ s(Insurance) + s(HouseCosts) + s(ElectricBill) + NumWorkers + FamilyType + FoodStamp +
OwnRent + NumBedrooms + s(NumChildren) + s(NumRooms) + s(NumPeople) + NumVehicles + Language,
data = acs_fit, family=binomial(link="logit"))
summary(gam_mod2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## HighIncome ~ s(Insurance) + s(HouseCosts) + s(ElectricBill) +
## NumWorkers + FamilyType + FoodStamp + OwnRent + NumBedrooms +
## s(NumChildren) + s(NumRooms) + s(NumPeople) + NumVehicles +
## Language
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.43414 0.31324 -17.348 < 2e-16 ***
## NumWorkers 0.19617 0.05528 3.549 0.000387 ***
## FamilyTypeMale Head 0.58537 0.32006 1.829 0.067406 .
## FamilyTypeMarried 1.37945 0.20733 6.653 2.87e-11 ***
## FoodStampYes -1.24465 0.37080 -3.357 0.000789 ***
## OwnRentOutright 2.18886 0.34034 6.431 1.26e-10 ***
## OwnRentRented -0.05986 0.25095 -0.239 0.811479
## NumBedrooms 0.04701 0.03670 1.281 0.200161
## NumVehicles 0.22197 0.04239 5.236 1.64e-07 ***
## LanguageEnglish -0.33222 0.16546 -2.008 0.044649 *
## LanguageOther -0.04632 0.28944 -0.160 0.872848
## LanguageOther European -0.21996 0.18646 -1.180 0.238115
## LanguageSpanish -0.62617 0.21181 -2.956 0.003114 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(Insurance) 8.570 8.934 183.64 < 2e-16 ***
## s(HouseCosts) 6.577 7.603 376.29 < 2e-16 ***
## s(ElectricBill) 1.452 1.781 32.08 1.36e-07 ***
## s(NumChildren) 3.112 3.796 16.59 0.00263 **
## s(NumRooms) 2.774 3.384 93.22 < 2e-16 ***
## s(NumPeople) 1.022 1.044 28.57 1.22e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.213 Deviance explained = 28.5%
## UBRE = -0.66611 Scale est. = 1 n = 18196
acs_test$gam2_HighIncome <- predict(gam_mod2, newdata=acs_test)
acs_test$gam2_HighIncome <- ifelse(acs_test$gam2_HighIncome > .5 ,1,0)
gam_mod2_rsq <- summary(gam_mod2)$r.sq
gam_mod2_aic <- AIC(gam_mod2)
gam_cm2 <- confusionMatrix(as.factor(acs_test$gam2_HighIncome), as.factor(acs_test$HighIncome), positive = "1")
gam_cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4259 239
## 1 18 33
##
## Accuracy : 0.9435
## 95% CI : (0.9364, 0.95)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.1827
##
## Kappa : 0.189
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.121324
## Specificity : 0.995791
## Pos Pred Value : 0.647059
## Neg Pred Value : 0.946865
## Prevalence : 0.059793
## Detection Rate : 0.007254
## Detection Prevalence : 0.011211
## Balanced Accuracy : 0.558557
##
## 'Positive' Class : 1
##
# Residual Analysis
summary(acs_test$HighIncome - predict(gam_mod2,newdata=acs_test))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.856 2.831 3.840 3.795 4.775 9.065
plot(gam_mod2)
This package automates the model specification process to some degree. It does so by “dredging” through all possible combinations of independent variables. You can then display “best models” and importance metrics, which can then be used to specify your model.
This link is a very detailed example of using MuMIn and the dredge function. Scsprintf(“Logistic model 6: Predicted Accuracy = %.3f Predicted Sensitivity = %.3f AIC = %.1f”,
https://sites.google.com/site/rforfishandwildlifegrads/home/mumin_usage_examples
# be aware that this takes time to run. It is worth the wait BUT frowned upon when used due to spurious results and inference
dredge_glm_mod <- glm(HighIncome ~ Insurance + HouseCosts + ElectricBill + FamilyType + FoodStamp + OwnRent +
NumBedrooms + NumRooms, family=binomial(logit), na.action = "na.fail", data=acs_fit)
dd_glm <- dredge(dredge_glm_mod)
## Fixed term is "(Intercept)"
summary(dredge_glm_mod)
##
## Call:
## glm(formula = HighIncome ~ Insurance + HouseCosts + ElectricBill +
## FamilyType + FoodStamp + OwnRent + NumBedrooms + NumRooms,
## family = binomial(logit), data = acs_fit, na.action = "na.fail")
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6709 -0.3183 -0.2205 -0.1516 3.8289
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.290e+00 2.415e-01 -30.179 < 2e-16 ***
## Insurance 3.115e-04 2.676e-05 11.637 < 2e-16 ***
## HouseCosts 5.372e-04 2.504e-05 21.455 < 2e-16 ***
## ElectricBill 2.099e-03 2.909e-04 7.215 5.39e-13 ***
## FamilyTypeMale Head 6.300e-01 3.220e-01 1.957 0.05036 .
## FamilyTypeMarried 1.578e+00 2.064e-01 7.647 2.06e-14 ***
## FoodStampYes -1.682e+00 3.695e-01 -4.552 5.32e-06 ***
## OwnRentOutright 1.926e+00 3.133e-01 6.148 7.83e-10 ***
## OwnRentRented 7.924e-02 1.965e-01 0.403 0.68683
## NumBedrooms 8.726e-02 3.367e-02 2.591 0.00956 **
## NumRooms 1.131e-01 1.403e-02 8.060 7.62e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 8398.1 on 18195 degrees of freedom
## Residual deviance: 6265.0 on 18185 degrees of freedom
## AIC: 6287
##
## Number of Fisher Scoring iterations: 8
# best supported models
subset(dd_glm, delta < 5)
## Global model call: glm(formula = HighIncome ~ Insurance + HouseCosts + ElectricBill +
## FamilyType + FoodStamp + OwnRent + NumBedrooms + NumRooms,
## family = binomial(logit), data = acs_fit, na.action = "na.fail")
## ---
## Model selection table
## (Intrc) ElctB FmlyT FdStm HsCst Insrn NmBdr NmRms OwnRn
## 256 -7.290 0.002099 + + 0.0005372 0.0003114 0.08726 0.1131 +
## 224 -7.152 0.002180 + + 0.0005430 0.0003188 0.1321 +
## df logLik AICc delta weight
## 256 11 -3132.486 6287.0 0.00 0.908
## 224 10 -3135.771 6291.6 4.57 0.092
## Models ranked by AICc(x)
# best model
subset(dd_glm, delta == 0)
## Global model call: glm(formula = HighIncome ~ Insurance + HouseCosts + ElectricBill +
## FamilyType + FoodStamp + OwnRent + NumBedrooms + NumRooms,
## family = binomial(logit), data = acs_fit, na.action = "na.fail")
## ---
## Model selection table
## (Intrc) ElctB FmlyT FdStm HsCst Insrn NmBdr NmRms OwnRn
## 256 -7.29 0.002099 + + 0.0005372 0.0003114 0.08726 0.1131 +
## df logLik AICc delta weight
## 256 11 -3132.486 6287 0 1
## Models ranked by AICc(x)
dredge_lm_mod <- lm(HighIncome ~ FamilyType + HouseCosts + Insurance + NumRooms + ElectricBill + FoodStamp +
OwnRent + NumBedrooms, data = acs_fit, na.action = na.fail)
dd_lm <- dredge(dredge_lm_mod)
## Fixed term is "(Intercept)"
summary(dredge_lm_mod)
##
## Call:
## lm(formula = HighIncome ~ FamilyType + HouseCosts + Insurance +
## NumRooms + ElectricBill + FoodStamp + OwnRent + NumBedrooms,
## data = acs_fit, na.action = na.fail)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.69682 -0.08760 -0.02763 0.01461 1.06314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.760e-01 7.444e-03 -23.639 < 2e-16 ***
## FamilyTypeMale Head 8.939e-03 8.467e-03 1.056 0.2911
## FamilyTypeMarried 3.353e-02 4.889e-03 6.858 7.19e-12 ***
## HouseCosts 4.880e-05 1.695e-06 28.791 < 2e-16 ***
## Insurance 4.178e-05 2.004e-06 20.845 < 2e-16 ***
## NumRooms 9.480e-03 8.948e-04 10.594 < 2e-16 ***
## ElectricBill 9.443e-05 1.662e-05 5.682 1.35e-08 ***
## FoodStampYes -1.475e-02 6.523e-03 -2.261 0.0238 *
## OwnRentOutright 1.269e-01 1.957e-02 6.481 9.31e-11 ***
## OwnRentRented 4.821e-02 5.871e-03 8.211 2.34e-16 ***
## NumBedrooms 2.370e-03 1.948e-03 1.217 0.2237
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2191 on 18185 degrees of freedom
## Multiple R-squared: 0.1671, Adjusted R-squared: 0.1667
## F-statistic: 364.9 on 10 and 18185 DF, p-value: < 2.2e-16
subset(dd_lm, delta < 5)
## Global model call: lm(formula = HighIncome ~ FamilyType + HouseCosts + Insurance +
## NumRooms + ElectricBill + FoodStamp + OwnRent + NumBedrooms,
## data = acs_fit, na.action = na.fail)
## ---
## Model selection table
## (Intrc) ElctB FmlyT FdStm HsCst Insrn NmBdr NmRms
## 224 -0.1731 9.647e-05 + + 4.896e-05 4.194e-05 0.010110
## 256 -0.1760 9.443e-05 + + 4.880e-05 4.178e-05 0.002370 0.009480
## 220 -0.1756 9.499e-05 + 4.922e-05 4.193e-05 0.010110
## 252 -0.1782 9.317e-05 + 4.908e-05 4.179e-05 0.002055 0.009564
## OwnRn df logLik AICc delta weight
## 224 + 11 1809.015 -3596.0 0.00 0.457
## 256 + 12 1809.756 -3595.5 0.52 0.352
## 220 + 10 1806.639 -3593.3 2.75 0.116
## 252 + 11 1807.199 -3592.4 3.63 0.074
## Models ranked by AICc(x)
subset(dd_lm, delta == 0)
## Global model call: lm(formula = HighIncome ~ FamilyType + HouseCosts + Insurance +
## NumRooms + ElectricBill + FoodStamp + OwnRent + NumBedrooms,
## data = acs_fit, na.action = na.fail)
## ---
## Model selection table
## (Intrc) ElctB FmlyT FdStm HsCst Insrn NmRms OwnRn df
## 224 -0.1731 9.647e-05 + + 4.896e-05 4.194e-05 0.01011 + 11
## logLik AICc delta weight
## 224 1809.015 -3596 0 1
## Models ranked by AICc(x)
tree1 <- rpart(HighIncome ~ FamilyType + HouseCosts + NumWorkers2 + OwnRent + Insurance + NumWorkers2 +
YearBuilt + NumBedrooms, data=acs_fit, method="class")
rpart.plot(tree1)
##head(predict(tree1))
##head(predict(tree1, type="class"))
tree1_cm <- confusionMatrix(predict(tree1, type="class"), acs_fit$HighIncome, positive = "1")
tree1_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16903 890
## 1 176 227
##
## Accuracy : 0.9414
## 95% CI : (0.9379, 0.9448)
## No Information Rate : 0.9386
## P-Value [Acc > NIR] : 0.05864
##
## Kappa : 0.2751
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.20322
## Specificity : 0.98969
## Pos Pred Value : 0.56328
## Neg Pred Value : 0.94998
## Prevalence : 0.06139
## Detection Rate : 0.01248
## Detection Prevalence : 0.02215
## Balanced Accuracy : 0.59646
##
## 'Positive' Class : 1
##
# Residual analysis
summary(acs_test$HighIncome - predict(tree1,newdata=acs_test))
## 0 1
## Min. :-0.9580 Min. :-0.563275
## 1st Qu.:-0.9580 1st Qu.:-0.041991
## Median :-0.9580 Median :-0.041991
## Mean :-0.8776 Mean :-0.002771
## 3rd Qu.:-0.9580 3rd Qu.:-0.041991
## Max. : 0.5633 Max. : 0.958009
tree2 <- rpart(HighIncome ~ FoodStamp + Insurance + FamilyType, data=acs_fit, method="class",
control=rpart.control(minsplit=2, cp=0))
rpart.plot(tree2)
##head(predict(tree2))
##head(predict(tree2, type="class"))
tree2_cm <- confusionMatrix(predict(tree2, type="class"), acs_fit$HighIncome, positive = "1")
tree2_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 17062 1086
## 1 17 31
##
## Accuracy : 0.9394
## 95% CI : (0.9358, 0.9428)
## No Information Rate : 0.9386
## P-Value [Acc > NIR] : 0.3397
##
## Kappa : 0.0484
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.027753
## Specificity : 0.999005
## Pos Pred Value : 0.645833
## Neg Pred Value : 0.940159
## Prevalence : 0.061387
## Detection Rate : 0.001704
## Detection Prevalence : 0.002638
## Balanced Accuracy : 0.513379
##
## 'Positive' Class : 1
##
# Residual analysis
summary(acs_test$HighIncome - predict(tree2,newdata=acs_test))
## 0 1
## Min. :-1.0000 Min. :-1.000000
## 1st Qu.:-0.9838 1st Qu.:-0.074370
## Median :-0.9572 Median :-0.022409
## Mean :-0.8779 Mean :-0.002539
## 3rd Qu.:-0.9256 3rd Qu.:-0.016161
## Max. : 0.5714 Max. : 1.000000
tree3 <- rpart(HighIncome ~ Insurance + ElectricBill + HouseCosts, data=acs_fit, method="class",
control=rpart.control(minsplit=2, cp=.005))
rpart.plot(tree3)
##head(predict(tree3))
##head(predict(tree3, type="class"))
tree3_cm <- confusionMatrix(predict(tree3, type="class"), acs_fit$HighIncome, positive = "1")
tree3_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16991 957
## 1 88 160
##
## Accuracy : 0.9426
## 95% CI : (0.9391, 0.9459)
## No Information Rate : 0.9386
## P-Value [Acc > NIR] : 0.013
##
## Kappa : 0.217
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.143241
## Specificity : 0.994847
## Pos Pred Value : 0.645161
## Neg Pred Value : 0.946679
## Prevalence : 0.061387
## Detection Rate : 0.008793
## Detection Prevalence : 0.013629
## Balanced Accuracy : 0.569044
##
## 'Positive' Class : 1
##
# Residual analysis
summary(acs_test$HighIncome - predict(tree3,newdata=acs_test))
## 0 1
## Min. :-0.9580 Min. :-0.740260
## 1st Qu.:-0.9580 1st Qu.:-0.041991
## Median :-0.9580 Median :-0.041991
## Mean :-0.8779 Mean :-0.002525
## 3rd Qu.:-0.9580 3rd Qu.:-0.041991
## Max. : 0.7403 Max. : 0.958009
tree4 <- rpart(HighIncome ~ Insurance + ElectricBill + HouseCosts + NumBedrooms + NumChildren +
NumPeople + NumRooms + NumVehicles + NumWorkers + FoodStamp + OwnRent + ElectricBill +
HouseCosts, data=acs_fit, method="class", control=rpart.control(minsplit=2, cp=0))
rpart.plot(tree4)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
##head(predict(tree4))
##head(predict(tree4, type="class"))
tree4_cm <- confusionMatrix(predict(tree4, type="class"), acs_fit$HighIncome, positive = "1")
tree4_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 17079 1
## 1 0 1116
##
## Accuracy : 0.9999
## 95% CI : (0.9997, 1)
## No Information Rate : 0.9386
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9995
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.99910
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.99994
## Prevalence : 0.06139
## Detection Rate : 0.06133
## Detection Prevalence : 0.06133
## Balanced Accuracy : 0.99955
##
## 'Positive' Class : 1
##
# Residual analysis
summary(acs_test$HighIncome - predict(tree4,newdata=acs_test))
## 0 1
## Min. :-1.0000 Min. :-1.00000
## 1st Qu.:-1.0000 1st Qu.: 0.00000
## Median :-1.0000 Median : 0.00000
## Mean :-0.8648 Mean :-0.01561
## 3rd Qu.:-1.0000 3rd Qu.: 0.00000
## Max. : 1.0000 Max. : 1.00000
tree5 <- rpart(HighIncome ~ Insurance + ElectricBill + HouseCosts + NumWorkers2, data=acs_fit,
method="class", control=rpart.control(minsplit=2, cp=.0025))
rpart.plot(tree5)
##head(predict(tree5))
##head(predict(tree5, type="class"))
tree5_cm <- confusionMatrix(predict(tree5, type="class"), acs_fit$HighIncome, positive = "1")
tree5_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16972 920
## 1 107 197
##
## Accuracy : 0.9436
## 95% CI : (0.9401, 0.9469)
## No Information Rate : 0.9386
## P-Value [Acc > NIR] : 0.002593
##
## Kappa : 0.2578
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.17637
## Specificity : 0.99373
## Pos Pred Value : 0.64803
## Neg Pred Value : 0.94858
## Prevalence : 0.06139
## Detection Rate : 0.01083
## Detection Prevalence : 0.01671
## Balanced Accuracy : 0.58505
##
## 'Positive' Class : 1
##
# Residual analysis
summary(acs_test$HighIncome - predict(tree5,newdata=acs_test))
## 0 1
## Min. :-1.0000 Min. :-0.789474
## 1st Qu.:-0.9580 1st Qu.:-0.041991
## Median :-0.9580 Median :-0.041991
## Mean :-0.8780 Mean :-0.002413
## 3rd Qu.:-0.9580 3rd Qu.:-0.041991
## Max. : 0.7895 Max. : 0.958009
rand_bag1 <- randomForest(HighIncome ~ Insurance + ElectricBill + HouseCosts + NumBedrooms , data=acs_test, mtry=4,
importance = TRUE, na.action = na.omit)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
rand_bag1
##
## Call:
## randomForest(formula = HighIncome ~ Insurance + ElectricBill + HouseCosts + NumBedrooms, data = acs_test, mtry = 4, importance = TRUE, na.action = na.omit)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 0.05218558
## % Var explained: 7.17
# plot model
plot(rand_bag1)
#### Paper or plastic? Time to bag using fit data
bag1_fit <- predict(rand_bag1, acs_fit, type="class" )
tail(bag1_fit)
## 22739 22740 22741 22742 22743
## 1.846667e-02 6.073333e-02 1.120000e-02 2.377667e-01 -8.773537e-17
## 22744
## -8.827661e-17
acs_fit$HighIncome_bag <- ifelse(bag1_fit > .5,1,0)
bag_cm1_fit <- confusionMatrix(acs_fit$HighIncome_bag, acs_fit$HighIncome, positive = "1")
bag_cm1_fit
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16862 930
## 1 217 187
##
## Accuracy : 0.937
## 95% CI : (0.9333, 0.9405)
## No Information Rate : 0.9386
## P-Value [Acc > NIR] : 0.827
##
## Kappa : 0.2205
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.16741
## Specificity : 0.98729
## Pos Pred Value : 0.46287
## Neg Pred Value : 0.94773
## Prevalence : 0.06139
## Detection Rate : 0.01028
## Detection Prevalence : 0.02220
## Balanced Accuracy : 0.57735
##
## 'Positive' Class : 1
##
# Residual analysis
summary(acs_test$HighIncome - predict(rand_bag1,newdata=acs_fit))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.975100 -0.066800 -0.007067 -0.009178 0.000000 1.000000
bag1_pred <- predict(rand_bag1, acs_test, type="class" )
acs_test$HighIncome_bag <- ifelse(bag1_pred > .5,1,0)
bag_cm1_pred <- confusionMatrix(acs_test$HighIncome_bag, acs_test$HighIncome, positive = "1")
bag_cm1_pred
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4277 84
## 1 0 188
##
## Accuracy : 0.9815
## 95% CI : (0.9772, 0.9852)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.808
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.69118
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.98074
## Prevalence : 0.05979
## Detection Rate : 0.04133
## Detection Prevalence : 0.04133
## Balanced Accuracy : 0.84559
##
## 'Positive' Class : 1
##
# Residual analysis
summary(acs_test$HighIncome - predict(rand_bag1,newdata=acs_test))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.483500 -0.027030 -0.002400 -0.003444 0.000000 0.789600
rand_forest <- randomForest(HighIncome ~ Insurance + ElectricBill + HouseCosts + NumBedrooms + ElectricBill +
NumVehicles + NumChildren + NumPeople + FoodStamp + Language + NumRooms + OwnRent +
NumUnits, data=acs_test, mtry=12, importance = TRUE, na.action = na.omit)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
rf_fit <- predict(rand_forest, acs_fit, type="class" )
acs_fit$HighIncome_rf_fit <- ifelse(rf_fit > .5,1,0)
rf_cm_fit <- confusionMatrix(acs_fit$HighIncome_rf_fit, acs_fit$HighIncome, positive = "1")
rf_cm_fit
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 16889 930
## 1 190 187
##
## Accuracy : 0.9384
## 95% CI : (0.9349, 0.9419)
## No Information Rate : 0.9386
## P-Value [Acc > NIR] : 0.5448
##
## Kappa : 0.2264
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.16741
## Specificity : 0.98888
## Pos Pred Value : 0.49602
## Neg Pred Value : 0.94781
## Prevalence : 0.06139
## Detection Rate : 0.01028
## Detection Prevalence : 0.02072
## Balanced Accuracy : 0.57814
##
## 'Positive' Class : 1
##
rf_pred <- predict(rand_forest, acs_test, type="class" )
acs_test$HighIncome_rf <- ifelse(rf_pred > .5,1,0)
rf_cm_pred <- confusionMatrix(acs_test$HighIncome_rf, acs_test$HighIncome, positive = "1")
rf_cm_pred
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4277 44
## 1 0 228
##
## Accuracy : 0.9903
## 95% CI : (0.987, 0.993)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9069
## Mcnemar's Test P-Value : 9.022e-11
##
## Sensitivity : 0.83824
## Specificity : 1.00000
## Pos Pred Value : 1.00000
## Neg Pred Value : 0.98982
## Prevalence : 0.05979
## Detection Rate : 0.05012
## Detection Prevalence : 0.05012
## Balanced Accuracy : 0.91912
##
## 'Positive' Class : 1
##
# Residual analysis
summary(acs_test$HighIncome - predict(rand_forest,newdata=acs_test))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.388500 -0.025800 -0.002567 -0.003068 0.000000 0.623100
# make predictions using test data
tree1_pred <- predict(tree1, acs_test, type="class" )
tree2_pred <- predict(tree2, acs_test, type="class" )
tree3_pred <- predict(tree3, acs_test, type="class" )
tree4_pred <- predict(tree4, acs_test, type="class" )
tree5_pred <- predict(tree5, acs_test, type="class" )
# Confusion matrices
tree_cm1_pred <- confusionMatrix(tree1_pred, acs_test$HighIncome, positive = "1")
tree_cm2_pred <- confusionMatrix(tree2_pred, acs_test$HighIncome, positive = "1")
tree_cm3_pred <- confusionMatrix(tree3_pred, acs_test$HighIncome, positive = "1")
tree_cm4_pred <- confusionMatrix(tree4_pred, acs_test$HighIncome, positive = "1")
tree_cm5_pred <- confusionMatrix(tree5_pred, acs_test$HighIncome, positive = "1")
# function to normalize data
normalize <- function(x) {
num <- x - min(x)
denom <- max(x) - min(x)
return (num/denom)
}
# create and normalize a new data frame for knn analysis
acs_numericals <- data.frame(acs$NumBedrooms, acs$NumChildren, acs$NumPeople, acs$NumRooms, acs$NumVehicles, acs$NumWorkers, acs$HouseCosts, acs$ElectricBill, acs$Insurance)
acs_norm <- as.data.frame(lapply(acs_numericals[1:8], normalize))
acs_norm$HighIncome1 <- c(acs$HighIncome)
m <- nrow(acs_numericals)
val <- sample(1:m, size = round(m/3))
acsNorm_learn <- acs_norm[-val,]
acsNorm_valid <- acs_norm[val,]
# view new data frame to verify normalization
summary(acs_norm)
## acs.NumBedrooms acs.NumChildren acs.NumPeople acs.NumRooms
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3750 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.2500
## Median :0.3750 Median :0.0000 Median :0.0625 Median :0.3000
## Mean :0.4232 Mean :0.0751 Mean :0.0869 Mean :0.3087
## 3rd Qu.:0.5000 3rd Qu.:0.1667 3rd Qu.:0.1250 3rd Qu.:0.3500
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## acs.NumVehicles acs.NumWorkers acs.HouseCosts acs.ElectricBill
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.3333 1st Qu.:0.3333 1st Qu.:0.09117 1st Qu.:0.1710
## Median :0.3333 Median :0.6667 Median :0.16878 Median :0.2573
## Mean :0.3521 Mean :0.5815 Mean :0.20836 Mean :0.3006
## 3rd Qu.:0.5000 3rd Qu.:0.6667 3rd Qu.:0.28168 3rd Qu.:0.3782
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## HighIncome1
## Min. :0.00000
## 1st Qu.:0.00000
## Median :0.00000
## Mean :0.06107
## 3rd Qu.:0.00000
## Max. :1.00000
acs_knn1 <- knn(acsNorm_learn[,1:8], acsNorm_valid[,1:8], acsNorm_learn$HighIncome1, k=5, prob = TRUE)
##head(acs_knn1)
pcol1 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[1:8], pch = pcol1, col = c("green3", "red")
[(acsNorm_valid$HighIncome1 != acs_knn1)+1])
knn1_cm_pred <- confusionMatrix(acs_knn1, acsNorm_valid$HighIncome, positive = "1")
knn1_cm_pred
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7019 420
## 1 75 68
##
## Accuracy : 0.9347
## 95% CI : (0.9289, 0.9402)
## No Information Rate : 0.9356
## P-Value [Acc > NIR] : 0.6394
##
## Kappa : 0.192
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.139344
## Specificity : 0.989428
## Pos Pred Value : 0.475524
## Neg Pred Value : 0.943541
## Prevalence : 0.064363
## Detection Rate : 0.008969
## Detection Prevalence : 0.018860
## Balanced Accuracy : 0.564386
##
## 'Positive' Class : 1
##
acs_knn2 <- knn(acsNorm_learn[,1:4], acsNorm_valid[,1:4], acsNorm_learn$HighIncome, k=5, prob = TRUE)
##head(acs_knn2)
pcol2 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[2:5], pch = pcol2, col = c("green3", "red")
[(acsNorm_valid$HighIncome1 != acs_knn2)+1])
knn2_cm_pred <- confusionMatrix(acs_knn2, acsNorm_valid$HighIncome, positive = "1")
knn2_cm_pred
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7067 477
## 1 27 11
##
## Accuracy : 0.9335
## 95% CI : (0.9277, 0.939)
## No Information Rate : 0.9356
## P-Value [Acc > NIR] : 0.7808
##
## Kappa : 0.0328
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.022541
## Specificity : 0.996194
## Pos Pred Value : 0.289474
## Neg Pred Value : 0.936771
## Prevalence : 0.064363
## Detection Rate : 0.001451
## Detection Prevalence : 0.005012
## Balanced Accuracy : 0.509367
##
## 'Positive' Class : 1
##
acs_knn3 <- knn(acsNorm_learn[,6:8], acsNorm_valid[,6:8], acsNorm_learn$HighIncome, k=3, prob = TRUE)
##head(acs_knn3)
pcol3 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[6:8], pch = pcol3, col = c("green3", "red")
[(acsNorm_valid$HighIncome1 != acs_knn3)+1])
knn3_cm_pred <- confusionMatrix(acs_knn3, acsNorm_valid$HighIncome, positive = "1")
knn3_cm_pred
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7014 421
## 1 80 67
##
## Accuracy : 0.9339
## 95% CI : (0.9281, 0.9394)
## No Information Rate : 0.9356
## P-Value [Acc > NIR] : 0.7376
##
## Kappa : 0.1868
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.137295
## Specificity : 0.988723
## Pos Pred Value : 0.455782
## Neg Pred Value : 0.943376
## Prevalence : 0.064363
## Detection Rate : 0.008837
## Detection Prevalence : 0.019388
## Balanced Accuracy : 0.563009
##
## 'Positive' Class : 1
##
acs_knn4 <- knn(acsNorm_learn[,1:8], acsNorm_valid[,1:8], acsNorm_learn$HighIncome, k=10, prob = TRUE)
##head(acs_knn4)
pcol4 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[1:8], pch = pcol4, col = c("green3", "red")
[(acsNorm_valid$HighIncome1 != acs_knn4)+1])
knn4_cm_pred <- confusionMatrix(acs_knn4, acsNorm_valid$HighIncome, positive = "1")
knn4_cm_pred
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7050 435
## 1 44 53
##
## Accuracy : 0.9368
## 95% CI : (0.9311, 0.9422)
## No Information Rate : 0.9356
## P-Value [Acc > NIR] : 0.3475
##
## Kappa : 0.1633
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.10861
## Specificity : 0.99380
## Pos Pred Value : 0.54639
## Neg Pred Value : 0.94188
## Prevalence : 0.06436
## Detection Rate : 0.00699
## Detection Prevalence : 0.01279
## Balanced Accuracy : 0.55120
##
## 'Positive' Class : 1
##
acs_knn5 <- knn(acsNorm_learn[,1:8], acsNorm_valid[,1:8], acsNorm_learn$HighIncome, k=25, prob = TRUE)
#head(acs_knn5)
pcol5 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[1:8], pch = pcol5, col = c("green3", "red")
[(acsNorm_valid$HighIncome1 != acs_knn5)+1])
knn5_cm_pred <- confusionMatrix(acs_knn5, acsNorm_valid$HighIncome, positive = "1")
knn5_cm_pred
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7071 430
## 1 23 58
##
## Accuracy : 0.9403
## 95% CI : (0.9347, 0.9455)
## No Information Rate : 0.9356
## P-Value [Acc > NIR] : 0.05198
##
## Kappa : 0.189
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.11885
## Specificity : 0.99676
## Pos Pred Value : 0.71605
## Neg Pred Value : 0.94267
## Prevalence : 0.06436
## Detection Rate : 0.00765
## Detection Prevalence : 0.01068
## Balanced Accuracy : 0.55781
##
## 'Positive' Class : 1
##
acs_knn6 <- knn(acsNorm_learn[,1:8], acsNorm_valid[,1:8], acsNorm_learn$HighIncome, k=50, prob = TRUE)
##head(acs_knn6)
pcol6 <- as.character(as.numeric(acsNorm_valid$HighIncome1))
pairs(acsNorm_valid[1:8], pch = pcol6, col = c("green3", "red")
[(acsNorm_valid$HighIncome1 != acs_knn6)+1])
knn6_cm_pred <- confusionMatrix(acs_knn6, acsNorm_valid$HighIncome, positive = "1")
knn6_cm_pred
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 7069 432
## 1 25 56
##
## Accuracy : 0.9397
## 95% CI : (0.9341, 0.945)
## No Information Rate : 0.9356
## P-Value [Acc > NIR] : 0.07568
##
## Kappa : 0.1818
## Mcnemar's Test P-Value : < 2e-16
##
## Sensitivity : 0.114754
## Specificity : 0.996476
## Pos Pred Value : 0.691358
## Neg Pred Value : 0.942408
## Prevalence : 0.064363
## Detection Rate : 0.007386
## Detection Prevalence : 0.010683
## Balanced Accuracy : 0.555615
##
## 'Positive' Class : 1
##
normalize <- function(x) {
num <- x - min(x)
denom <- max(x) - min(x)
return (num/denom)
}
acs_numericals2 <- data.frame(acs$HouseCosts, acs$ElectricBill, acs$Insurance)
acs_norm2 <- as.data.frame(lapply(acs_numericals2[1:3], normalize))
acs_norm2$FamilyType1 <- c(acs$FamilyType)
distance <- dist(acs_norm2, method ="euclidean")
pfit <- hclust(distance, method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
cluster1 <- plot(pfit, labels = acs_norm2$FamilyType)
rect.hclust(pfit, k=5)
acs_norm2$groups <- cutree(pfit, k=5)
table(acs_norm2$groups)
##
## 1 2 3 4 5
## 7075 3266 1153 7641 3610
# Add additional variables ot the data set for model specification
acs_norm2$HighIncome <- c(acs$HighIncome)
# Break it into a training and test set with an 80/20 split.
set.seed(447)
testrecs <- sample(nrow(acs_norm2),0.2 * nrow(acs_norm2))
acs_norm2_test <- acs_norm2[testrecs,]
acs_norm2_fit <- acs_norm2[-testrecs,]
gam_mod_cluster <- gam(HighIncome ~ s(acs.Insurance) + s(acs.HouseCosts) + s(acs.ElectricBill) + groups,
data = acs_norm2_fit, family=binomial(link="logit"))
summary(gam_mod_cluster)
##
## Family: binomial
## Link function: logit
##
## Formula:
## HighIncome ~ s(acs.Insurance) + s(acs.HouseCosts) + s(acs.ElectricBill) +
## groups
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.47032 0.08509 -40.784 <2e-16 ***
## groups 0.03128 0.02440 1.282 0.2
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(acs.Insurance) 8.499 8.911 221.90 < 2e-16 ***
## s(acs.HouseCosts) 7.135 8.093 481.71 < 2e-16 ***
## s(acs.ElectricBill) 1.619 2.021 48.26 3.8e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.177 Deviance explained = 23.3%
## UBRE = -0.64375 Scale est. = 1 n = 18196
acs_norm2_test$gam_cluster_HighIncome <- predict(gam_mod_cluster, newdata=acs_norm2_test)
acs_norm2_test$gam_cluster_HighIncome <- ifelse(acs_norm2_test$gam_cluster_HighIncome > .5 ,1,0)
gam_cluster_cm <- confusionMatrix(as.factor(acs_norm2_test$gam_cluster_HighIncome), as.factor(acs_norm2_test$HighIncome),
positive = "1")
gam_cluster_adjR <- summary(gam_mod_cluster)$r.sq
gam_cluster_aic <- AIC(gam_mod_cluster)
gam_cluster_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4258 247
## 1 19 25
##
## Accuracy : 0.9415
## 95% CI : (0.9343, 0.9482)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.3685
##
## Kappa : 0.144
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.091912
## Specificity : 0.995558
## Pos Pred Value : 0.568182
## Neg Pred Value : 0.945172
## Prevalence : 0.059793
## Detection Rate : 0.005496
## Detection Prevalence : 0.009672
## Balanced Accuracy : 0.543735
##
## 'Positive' Class : 1
##
gam_cluster_adjR
## [1] 0.1772829
gam_cluster_aic
## [1] 6482.354
acs_numericals3 <- data.frame(acs$HouseCosts, acs$ElectricBill, acs$Insurance, acs$NumBedrooms,
acs$NumChildren, acs$NumWorkers, acs$NumRooms)
acs_norm3 <- as.data.frame(lapply(acs_numericals3[1:7], normalize))
distance2 <- dist(acs_norm3, method ="euclidean")
pfit2 <- hclust(distance2, method = "ward")
## The "ward" method has been renamed to "ward.D"; note new "ward.D2"
cluster2 <- plot(pfit2)
rect.hclust(pfit2, k=5)
acs_norm3$groups <- cutree(pfit2, k=15)
table(acs_norm3$groups)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1606 306 2402 932 1460 1166 1550 617 2387 2754 1089 2855 1623 1723 275
# Add additional variables ot the data set for model specification
acs_norm3$HighIncome <- c(acs$HighIncome)
acs_norm3$FamilyType <- c(acs$FamilyType)
acs_norm3$Foodstamp <- c(acs$FoodStamp)
acs_norm3$OwnRent <- c(acs$OwnRent)
# Break it into a training and test set with an 80/20 split.
set.seed(447)
testrecs <- sample(nrow(acs_norm3),0.2 * nrow(acs_norm3))
acs_norm3_test <- acs_norm3[testrecs,]
acs_norm3_fit <- acs_norm3[-testrecs,]
gam_mod_cluster2 <- gam(HighIncome ~ s(acs.Insurance) + s(acs.HouseCosts) + s(acs.ElectricBill) +
acs.NumBedrooms + s(acs.NumChildren) + acs.NumWorkers + s(acs.NumRooms) +
FamilyType + Foodstamp + OwnRent + groups, data = acs_norm3_fit,
family=binomial(link="logit"))
summary(gam_mod_cluster2)
##
## Family: binomial
## Link function: logit
##
## Formula:
## HighIncome ~ s(acs.Insurance) + s(acs.HouseCosts) + s(acs.ElectricBill) +
## acs.NumBedrooms + s(acs.NumChildren) + acs.NumWorkers + s(acs.NumRooms) +
## FamilyType + Foodstamp + OwnRent + groups
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.59167 0.52466 -8.752 < 2e-16 ***
## acs.NumBedrooms 0.30668 0.28764 1.066 0.2863
## acs.NumWorkers -0.36105 0.20503 -1.761 0.0783 .
## FamilyType 0.77413 0.09801 7.899 2.82e-15 ***
## Foodstamp -1.64131 0.36794 -4.461 8.17e-06 ***
## OwnRent 0.06540 0.12115 0.540 0.5893
## groups 0.07053 0.01507 4.679 2.88e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(acs.Insurance) 8.529 8.921 212.986 < 2e-16 ***
## s(acs.HouseCosts) 6.567 7.595 288.269 < 2e-16 ***
## s(acs.ElectricBill) 1.487 1.833 35.341 2.79e-08 ***
## s(acs.NumChildren) 3.022 3.700 9.904 0.0449 *
## s(acs.NumRooms) 2.896 3.530 108.486 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.21 Deviance explained = 27.8%
## UBRE = -0.66332 Scale est. = 1 n = 18196
acs_norm3_test$gam_cluster_HighIncome <- predict(gam_mod_cluster2, newdata=acs_norm3_test)
acs_norm3_test$gam_cluster_HighIncome <- ifelse(acs_norm3_test$gam_cluster_HighIncome > .5 ,1,0)
gam_cluster_cm2 <- confusionMatrix(as.factor(acs_norm3_test$gam_cluster_HighIncome), as.factor(acs_norm3_test$HighIncome),
positive = "1")
gam_cluster2_adjR <- summary(gam_mod_cluster2)$r.sq
gam_cluster2_aic <- AIC(gam_mod_cluster2)
gam_cluster_cm2
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 4258 237
## 1 19 35
##
## Accuracy : 0.9437
## 95% CI : (0.9366, 0.9502)
## No Information Rate : 0.9402
## P-Value [Acc > NIR] : 0.1663
##
## Kappa : 0.1989
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.128676
## Specificity : 0.995558
## Pos Pred Value : 0.648148
## Neg Pred Value : 0.947275
## Prevalence : 0.059793
## Detection Rate : 0.007694
## Detection Prevalence : 0.011871
## Balanced Accuracy : 0.562117
##
## 'Positive' Class : 1
##
gam_cluster2_adjR
## [1] 0.2100819
gam_cluster2_aic
## [1] 6126.215
Display important metrics for each model
sprintf("LOGISTIC REGRESSION")
## [1] "LOGISTIC REGRESSION"
sprintf("Logistic model 1: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f AIC = %.1f",
log_cm1$overall['Accuracy'], log_cm1$byClass['Sensitivity'], log_mod1_aic)
## [1] "Logistic model 1: Predicted Accuracy = 0.6324 Predicted Sensitivity = 0.790 AIC = 7118.0"
sprintf("Logistic model 2: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f AIC = %.1f",
log_cm2$overall['Accuracy'], log_cm2$byClass['Sensitivity'], log_mod2_aic)
## [1] "Logistic model 2: Predicted Accuracy = 0.3317 Predicted Sensitivity = 0.897 AIC = 8049.1"
sprintf("Logistic model 3: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f AIC = %.1f",
log_cm3$overall['Accuracy'], log_cm3$byClass['Sensitivity'], log_mod3_aic)
## [1] "Logistic model 3: Predicted Accuracy = 0.7232 Predicted Sensitivity = 0.735 AIC = 7319.5"
sprintf("Logistic model 4: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f AIC = %.1f",
log_cm4$overall['Accuracy'], log_cm4$byClass['Sensitivity'], log_mod4_aic)
## [1] "Logistic model 4: Predicted Accuracy = 0.7191 Predicted Sensitivity = 0.732 AIC = 7412.8"
sprintf("Logistic model 5: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f AIC = %.1f",
log_cm5$overall['Accuracy'], log_cm5$byClass['Sensitivity'], log_mod5_aic)
## [1] "Logistic model 5: Predicted Accuracy = 0.7402 Predicted Sensitivity = 0.816 AIC = 6188.7"
sprintf("Logistic model 6: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f AIC = %.1f",
log_cm6$overall['Accuracy'], log_cm6$byClass['Sensitivity'], log_mod6_aic)
## [1] "Logistic model 6: Predicted Accuracy = 0.7270 Predicted Sensitivity = 0.798 AIC = 6374.8"
sprintf("Logistic model 6: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f AIC = %.1f",
log_cm6$overall['Accuracy'], log_cm6$byClass['Sensitivity'], log_mod6_aic)
## [1] "Logistic model 6: Predicted Accuracy = 0.7270 Predicted Sensitivity = 0.798 AIC = 6374.8"
sprintf(" ")
## [1] " "
sprintf("LINEAR REGRESSION")
## [1] "LINEAR REGRESSION"
sprintf("Linear model 1: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f Adj R-squared = %.3f",
linear_cm1$overall['Accuracy'], linear_cm1$byClass['Sensitivity'], linear_mod1_rsq)
## [1] "Linear model 1: Predicted Accuracy = 0.9444 Predicted Sensitivity = 0.243 Adj R-squared = 0.314"
sprintf("Linear model 2: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f Adj R-squared = %.3f",
linear_cm2$overall['Accuracy'], linear_cm2$byClass['Sensitivity'], linear_mod2_rsq)
## [1] "Linear model 2: Predicted Accuracy = 0.9435 Predicted Sensitivity = 0.239 Adj R-squared = 0.341"
sprintf(" ")
## [1] " "
sprintf("SUPPORT VECTOR MACHINES")
## [1] "SUPPORT VECTOR MACHINES"
sprintf("SVM model 1: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f",
svm_cm1$overall['Accuracy'], svm_cm1$byClass['Sensitivity'])
## [1] "SVM model 1: Predicted Accuracy = 0.9453 Predicted Sensitivity = 0.151"
sprintf("SVM model 2: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f",
svm_cm2$overall['Accuracy'], svm_cm2$byClass['Sensitivity'])
## [1] "SVM model 2: Predicted Accuracy = 0.9455 Predicted Sensitivity = 0.180"
sprintf("SVM model 3: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f",
svm_cm3$overall['Accuracy'], svm_cm3$byClass['Sensitivity'])
## [1] "SVM model 3: Predicted Accuracy = 0.9466 Predicted Sensitivity = 0.176"
sprintf(" ")
## [1] " "
sprintf("MULTILEVEL MODEL SPECIFICATION")
## [1] "MULTILEVEL MODEL SPECIFICATION"
sprintf("MMS model 1: Predicted Accuracy = NA Predicted Sensitivity = NA AIC = %.1f", mms1_aic)
## [1] "MMS model 1: Predicted Accuracy = NA Predicted Sensitivity = NA AIC = -3398.4"
sprintf("MMS model 2: Predicted Accuracy = NA Predicted Sensitivity = NA AIC = %.1f", mms2_aic)
## [1] "MMS model 2: Predicted Accuracy = NA Predicted Sensitivity = NA AIC = -462.3"
sprintf(" ")
## [1] " "
sprintf("GENERALIZED ADDITIVE MODELS")
## [1] "GENERALIZED ADDITIVE MODELS"
sprintf("GAM model 1: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f AIC = %.1f",
gam_cm1$overall['Accuracy'], gam_cm1$byClass['Sensitivity'], gam_mod_aic)
## [1] "GAM model 1: Predicted Accuracy = 0.9398 Predicted Sensitivity = 0.243 AIC = 463153.9"
sprintf("GAM model 2: Predicted Accuracy = %.4f Predicted Sensitivity = %.3f AIC = %.1f",
gam_cm2$overall['Accuracy'], gam_cm2$byClass['Sensitivity'], gam_mod2_aic)
## [1] "GAM model 2: Predicted Accuracy = 0.9435 Predicted Sensitivity = 0.121 AIC = 6075.4"
sprintf(" ")
## [1] " "
sprintf("DECISION TREES")
## [1] "DECISION TREES"
sprintf("Tree 1: Fit Accuracy = %.4f Predicted Accuracy = %.4f Predicted Sensitivity = %.4f",tree1_cm$overall['Accuracy'],
tree_cm1_pred$overall['Accuracy'], tree_cm1_pred$byClass['Sensitivity'])
## [1] "Tree 1: Fit Accuracy = 0.9414 Predicted Accuracy = 0.9422 Predicted Sensitivity = 0.2169"
sprintf("Tree 2: Fit Accuracy = %.4f Predicted Accuracy = %.4f Predicted Sensitivity = %.4f",tree2_cm$overall['Accuracy'],
tree_cm2_pred$overall['Accuracy'], tree_cm2_pred$byClass['Sensitivity'])
## [1] "Tree 2: Fit Accuracy = 0.9394 Predicted Accuracy = 0.9391 Predicted Sensitivity = 0.0037"
sprintf("Tree 3: Fit Accuracy = %.4f Predicted Accuracy = %.4f Predicted Sensitivity = %.4f",tree3_cm$overall['Accuracy'],
tree_cm3_pred$overall['Accuracy'], tree_cm3_pred$byClass['Sensitivity'])
## [1] "Tree 3: Fit Accuracy = 0.9426 Predicted Accuracy = 0.9426 Predicted Sensitivity = 0.1471"
sprintf("Tree 4: Fit Accuracy = %.4f Predicted Accuracy = %.4f Predicted Sensitivity = %.4f",tree4_cm$overall['Accuracy'],
tree_cm4_pred$overall['Accuracy'], tree_cm4_pred$byClass['Sensitivity'])
## [1] "Tree 4: Fit Accuracy = 0.9999 Predicted Accuracy = 0.8965 Predicted Sensitivity = 0.2647"
sprintf("Tree 5: Fit Accuracy = %.4f Predicted Accuracy = %.4f Predicted Sensitivity = %.4f",tree5_cm$overall['Accuracy'],
tree_cm5_pred$overall['Accuracy'], tree_cm5_pred$byClass['Sensitivity'])
## [1] "Tree 5: Fit Accuracy = 0.9436 Predicted Accuracy = 0.9424 Predicted Sensitivity = 0.1654"
sprintf("Bag 1: Fit Accuracy = %.4f Predicted Accuracy = %.4f Predicted Sensitivity = %.4f",bag_cm1_fit$overall['Accuracy'],
bag_cm1_pred$overall['Accuracy'], bag_cm1_pred$byClass['Sensitivity'])
## [1] "Bag 1: Fit Accuracy = 0.9370 Predicted Accuracy = 0.9815 Predicted Sensitivity = 0.6912"
sprintf("RF: Fit Accuracy = %.4f Predicted Accuracy = %.4f Predicted Sensitivity = %.4f",rf_cm_fit$overall['Accuracy'],
rf_cm_pred$overall['Accuracy'], rf_cm_pred$byClass['Sensitivity'])
## [1] "RF: Fit Accuracy = 0.9384 Predicted Accuracy = 0.9903 Predicted Sensitivity = 0.8382"
sprintf(" ")
## [1] " "
sprintf("K NEAREST NEIGHBOR")
## [1] "K NEAREST NEIGHBOR"
sprintf("Knn 1: Predicted Accuracy = %.4f Predicted Sensitivity = %.4f", knn1_cm_pred$overall['Accuracy'],
knn1_cm_pred$byClass['Sensitivity'])
## [1] "Knn 1: Predicted Accuracy = 0.9347 Predicted Sensitivity = 0.1393"
sprintf("Knn 2: Predicted Accuracy = %.4f Predicted Sensitivity = %.4f", knn2_cm_pred$overall['Accuracy'],
knn2_cm_pred$byClass['Sensitivity'])
## [1] "Knn 2: Predicted Accuracy = 0.9335 Predicted Sensitivity = 0.0225"
sprintf("Knn 3: Predicted Accuracy = %.4f Predicted Sensitivity = %.4f", knn3_cm_pred$overall['Accuracy'],
knn3_cm_pred$byClass['Sensitivity'])
## [1] "Knn 3: Predicted Accuracy = 0.9339 Predicted Sensitivity = 0.1373"
sprintf("Knn 4: Predicted Accuracy = %.4f Predicted Sensitivity = %.4f", knn4_cm_pred$overall['Accuracy'],
knn4_cm_pred$byClass['Sensitivity'])
## [1] "Knn 4: Predicted Accuracy = 0.9368 Predicted Sensitivity = 0.1086"
sprintf("Knn 5: Predicted Accuracy = %.4f Predicted Sensitivity = %.4f", knn5_cm_pred$overall['Accuracy'],
knn5_cm_pred$byClass['Sensitivity'])
## [1] "Knn 5: Predicted Accuracy = 0.9403 Predicted Sensitivity = 0.1189"
sprintf("Knn 6: Predicted Accuracy = %.4f Predicted Sensitivity = %.4f", knn6_cm_pred$overall['Accuracy'],
knn6_cm_pred$byClass['Sensitivity'])
## [1] "Knn 6: Predicted Accuracy = 0.9397 Predicted Sensitivity = 0.1148"
sprintf(" ")
## [1] " "
sprintf("CLUSTER ANALYSIS USED IN MODELS")
## [1] "CLUSTER ANALYSIS USED IN MODELS"
sprintf("GAM cluster 1: Predicted Accuracy = %.3f Predicted Sensitivity = %.3f AIC = %.1f",
gam_cluster_cm$overall['Accuracy'], gam_cluster_cm$byClass['Sensitivity'], gam_cluster_aic)
## [1] "GAM cluster 1: Predicted Accuracy = 0.942 Predicted Sensitivity = 0.092 AIC = 6482.4"
sprintf("GAM cluster 2: Predicted Accuracy = %.3f Predicted Sensitivity = %.3f AIC = %.1f",
gam_cluster_cm2$overall['Accuracy'], gam_cluster_cm2$byClass['Sensitivity'], gam_cluster2_aic)
## [1] "GAM cluster 2: Predicted Accuracy = 0.944 Predicted Sensitivity = 0.129 AIC = 6126.2"
+ Fit accuracy does not result in in most accurate predictions - Decision tree 4 is a good example of overfitting
+ Until a model is tested on new data, a decision on which model to use is risky
+ Logistic models were very ineffective, which was surprising to me
+ I had better luck using linear regression to estimate family income, converting the estimates to a binary, and testing the model than using logistic regression
+ The more complicated models can lead to weak predictive power
+ Interestingly, my best model for hw4 was a relatively simple model using the "workhorse" method of linear regression
+ Dredge is a useful function for exploratory purposes in determing the "best" variables to include in the model
+ SVM was the best performing regression method
+ The most effective model/method for predicting with this data set was a Random Forest
+ This does not mean that Random Forests will always perform the best because each data set is unique
+ Different variables, ranges, topics, and purposes could lead to different methods being more effective than others
+ As a result, I recommend exploring all methods when specifying the "best" predictive model because what holds for one data set may not hold for another